In [2]:
import pandas as pd
from preprocess import *
import random
# Seed Python's global `random` generator: the random.sample draws further down
# (tuning and test author selection) take their randomness from it, so they are
# repeatable only when the notebook is run top to bottom on a fresh kernel.
# NOTE(review): generate_feeds receives its own `seed` argument; whether it also
# touches this global generator cannot be told from here — confirm.
random.seed(22)

In [3]:
# Import the users levels dataset which has been cleaned up.
# Its 'author' and 'english' columns are the ones consumed by the merge further down.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
user_levels_df = pd.read_pickle('Data/users_levels_clean')

# Import the dataset containing the comments AFTER language detection.
# It must expose an 'author' column, which is the key used by the merge further down.
df = pd.read_pickle('Data/Classified/english_comments_2')

In [4]:
# Attach each author's English level (column 'english' of user_levels_df) to the
# comments so that native and non-native speakers can be separated.
# The merge is an inner join on 'author': comments whose author has no entry in
# user_levels_df are dropped.
# The guard keeps this cell idempotent: merging a second time would yield
# 'english_x' / 'english_y' columns and make the df['english'] lookup below fail.
if 'english' not in df.columns:
    df = df.merge(user_levels_df[['author', 'english']], on = 'author')

# Level 'N' identifies native speakers; every other value counts as non-native.
# NOTE(review): rows whose level is missing (NaN) compare unequal to 'N' and thus
# also end up in `non_natives` — confirm this is intended.
natives_bool = (df['english'] == 'N')
natives = df[natives_bool]
non_natives = df[~natives_bool]

In [5]:
# Build the feeds for native and non-native speakers separately, using the exact
# same settings for both groups.
# NOTE(review): generate_feeds comes from the star import of `preprocess`; the
# message below counts result rows as authors, so presumably it returns one row
# per author — confirm against its implementation.
feed_settings = dict(nb_feeds = 20, nb_words_per_feed = 500, exact = False, seed = 0)

native_feeds = generate_feeds(natives, **feed_settings)
non_native_feeds = generate_feeds(non_natives, **feed_settings)

print(
    'The new datasets contains ',
    len(native_feeds),
    'native and',
    len(non_native_feeds),
    'non-native authors.',
)


100%|██████████| 653/653 [00:01<00:00, 631.98it/s]
100%|██████████| 325/325 [00:00<00:00, 807.11it/s]

The new datasets contains  354 native and 135 non-native authors.





In [6]:
# Persist both freshly generated feed tables as pickle files under Data/Feeds.
for feeds, pickle_path in (
    (native_feeds, 'Data/Feeds/native_english_20feeds'),
    (non_native_feeds, 'Data/Feeds/non_native_english_20feeds'),
):
    feeds.to_pickle(pickle_path)

In [7]:
# Build the two tuning dataframes: 30 randomly picked rows from the native feeds
# and 30 randomly picked rows from the non-native feeds.
# Row positions are drawn without replacement from the global `random` generator
# (natives first, then non-natives), so the outcome depends on its current state.
N_TUNING_AUTHORS = 30

tunning_samples_native     = random.sample(range(len(native_feeds.index)), N_TUNING_AUTHORS)
tunning_samples_non_native = random.sample(range(len(non_native_feeds.index)), N_TUNING_AUTHORS)

# Positional lookup turns the sampled row positions into the actual rows.
native_authors_tunning, non_native_authors_tunning = (
    native_feeds.iloc[tunning_samples_native],
    non_native_feeds.iloc[tunning_samples_non_native],
)

In [8]:
# Build the two testing dataframes: 100 randomly picked rows from the native
# feeds and 100 from the non-native feeds, none of which belong to the tuning subset.
N_TEST_AUTHORS = 100

# Remove, by index label, the rows already selected for the tuning subset.
# NOTE(review): this relies on the feeds' index labels being unique; with
# duplicated labels more rows than the sampled ones would be removed — confirm.
test_native_feeds     = native_feeds.drop(labels = native_authors_tunning.index, axis = 'index')
test_non_native_feeds = non_native_feeds.drop(labels = non_native_authors_tunning.index, axis = 'index')

# Draw row positions among the remaining rows (natives first, then non-natives);
# random.sample raises ValueError if fewer than N_TEST_AUTHORS rows are left.
test_samples_native     = random.sample(range(len(test_native_feeds.index)), N_TEST_AUTHORS)
test_samples_non_native = random.sample(range(len(test_non_native_feeds.index)), N_TEST_AUTHORS)

native_authors_testing, non_native_authors_testing = (
    test_native_feeds.iloc[test_samples_native],
    test_non_native_feeds.iloc[test_samples_non_native],
)

In [9]:
# Export the four subsets as text files.
# NOTE: the files carry a .csv extension but are written tab-delimited, so any
# reader has to pass sep='\t' as well.
csv_exports = {
    'dataset/Tunning/30native_english.csv': native_authors_tunning,
    'dataset/Tunning/30non_native_english.csv': non_native_authors_tunning,
    'dataset/Test/100native_english.csv': native_authors_testing,
    'dataset/Test/100non_native_english.csv': non_native_authors_testing,
}
for csv_path, subset in csv_exports.items():
    subset.to_csv(csv_path, sep = '\t')

In [10]:
# Export the same four subsets in Parquet format (not pickle); the target paths
# carry no file extension.
parquet_exports = {
    'dataset/Tunning/30native_english': native_authors_tunning,
    'dataset/Tunning/30non_native_english': non_native_authors_tunning,
    'dataset/Test/100native_english': native_authors_testing,
    'dataset/Test/100non_native_english': non_native_authors_testing,
}
for parquet_path, subset in parquet_exports.items():
    subset.to_parquet(parquet_path)