In [6]:
# Load the required modules
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
import langdetect
import random
from preprocess import *


# Seed the stdlib and NumPy global random generators with the same value so that
# later random draws start from a fixed state
for seed_rng in (random.seed, np.random.seed):
    seed_rng(22)

# Default figure size for every matplotlib plot in the notebook
plt.rcParams.update({'figure.figsize': [10, 5]})

# 1. First pass over the data: building the dataframe that will be passed to the langdetect module

The langdetect module takes a long time to run over 1M comments, so we first filter out all the data we cannot use: authors with fewer than 10,000 words, authors whose flair was not detected by the parser, authors spamming the same comment, and all the comments that are just a link.

In [7]:
# Read the table of Reddit users, keep only the author name and the
# proficiency-level columns, and renumber the rows from 0
levels_df = (
    pd.read_csv('../../Data/Raw/user_levels.csv')
    .loc[:, ['author', 'N', 'A1', 'A2', 'B1', 'B2', 'C1', 'C2']]
    .reset_index(drop=True)
)

print('The dataset contains', len(levels_df), 'users.')
levels_df.head(10)

The dataset contains 2144 users.


Unnamed: 0,author,N,A1,A2,B1,B2,C1,C2
0,jlba64,['fr'],[],[],[],[],[],[]
1,alexsteb,['de'],[],[],['ko'],[],['en'],[]
2,Noktilucent,['en'],[],"['de', 'it']",['es'],[],[],[]
3,makingthematrix,['pl'],[],[],[],[],[],[]
4,jammal20,['ar'],[],"['tr', 'es']",[],['he'],[],[]
5,kaeya_lilies,[],[],[],[],[],[],[]
6,pidgeon-eater-69,[],[],[],[],[],[],[]
7,Frenes,[],[],[],[],[],[],[]
8,himit,[],[],[],[],[],[],['ja']
9,ii_akinae_ii,['en'],[],[],['zh'],[],[],[]


In [None]:
# Collect every author name listed in the user-levels dataframe
authors = levels_df.author.tolist()

path_to_json = '../../Data/Raw/user_comments/'
frames = []

# Load each author's comment dump (one JSON file per author) and stack them all
for author in authors:
    fpath = path_to_json + author + '.json'

    # Skip files rejected by is_non_zero_file (imported from preprocess;
    # presumably missing or zero-byte files -- TODO confirm)
    if not is_non_zero_file(fpath):
        continue

    df = pd.read_json(fpath)

    # A non-empty file can still parse to an empty frame: nothing to keep,
    # and reading its first cell would raise an IndexError
    if df.empty:
        continue

    # Suspended/deleted accounts are flagged in the first cell of the file.
    # `.iloc[0, 0]` is purely positional, whereas `.iloc[0][0]` depends on the
    # deprecated integer-position fallback of Series[...] for labelled columns.
    if df.iloc[0, 0] not in ('suspended', 'deleted'):
        frames.append(df)

# NOTE: pd.concat raises a ValueError if no usable file was found at all
comments_df = pd.concat(frames)
comments_df.head(5)

In [7]:
# Minimum total number of words an author must have written to be kept
MIN_WORDS_PER_AUTHOR = 10000

# Drop comments repeated verbatim by the same author (spam would bias the ML),
# keep only the useful columns and count the whitespace-separated words of each
# comment. Chaining with `assign` builds a fresh frame instead of writing into
# a slice, so no SettingWithCopyWarning is raised and the cell can be re-run.
comments_df = (
    comments_df
    .drop_duplicates(subset=['author', 'body'], keep='first')
    .loc[:, ['author', 'body', 'created_utc']]
    .assign(number_of_words=lambda frame: frame['body'].str.split().str.len())
)

# Keep only the authors with at least MIN_WORDS_PER_AUTHOR words written in total
comments_df_sum = comments_df.groupby('author')[['number_of_words']].sum()
kept_authors = comments_df_sum[comments_df_sum.number_of_words >= MIN_WORDS_PER_AUTHOR].index.tolist()
comments_df = comments_df[comments_df['author'].isin(kept_authors)]

# Remove the comments made of a single token starting with 'http' (a bare link).
# `na=False` treats missing bodies as "not a link", as the original comparison did.
link = (comments_df['number_of_words'] <= 1) & comments_df['body'].str.startswith('http', na=False)
comments_df = comments_df[~link].reset_index(drop=True)

# Save the dataframe
comments_df.to_pickle('../../Data/Preprocessing/processed_comments_all_flairs.pkl')

comments_df.head()

Unnamed: 0,author,body,created_utc,number_of_words,body_lang
0,alexsteb,Check out TalkToMeInKorean. They at least HAD ...,1666878614,16,
1,alexsteb,"It's a toss up between Japanese, Korean and Ma...",1666874877,49,
2,alexsteb,"I've seen something like those also in Göreme,...",1666793085,14,
3,alexsteb,Wrong tones are like wrong vowels. It's still ...,1666767494,21,
4,alexsteb,\*als du es dir vorgestellt hast.,1666767139,6,


# 2. Language detection

In [None]:
# Import the processed df
comments_lang = pd.read_pickle('../../Data/Preprocessing/processed_comments_all_flairs.pkl')

# Put all the comments into a list of strings
comments = comments_lang.body.to_list()

# langdetect is non-deterministic by default: fix its seed before the first
# detection so that re-running the cell yields the same labels
langdetect.DetectorFactory.seed = 22

# For every comment, record the language with the highest probability
# (the first entry returned by detect_langs).
# If no language can be detected, record 'U' for undefined.
language = []
for comment in tqdm(comments):
    try:
        langs = langdetect.detect_langs(comment)
        language.append(langs[0].lang)
    except Exception:
        # Best effort: any detection failure is labelled undefined. Catching
        # Exception rather than using a bare `except` lets KeyboardInterrupt
        # through, so this long-running loop can still be stopped.
        language.append('U')

# Add the column with the detected language of each comment
comments_lang['body_lang'] = language

# Save the new dataframe
comments_lang.to_pickle('../../Data/Preprocessing/langdetect_classification_all.pkl')

comments_lang.head()

# 3. Keep only the authors with more than 10,000 words written in English and generate feeds

> * After language detection, English turned out to be the only language with enough comments to work on. All the other languages are therefore dropped.
> * There are also not enough authors writing in English at each proficiency level, so we only distinguish 'Native' and 'Learners' for
> our research question.
> * Feeds are documents containing ~500 words for each author, on which the features will be developed.


In [43]:
# Per-user language levels (same CSV as the one loaded at the top of the notebook)
user_df = pd.read_csv('../../Data/Raw/user_levels.csv')

# Comments AFTER language detection: langdetect results completed by hand.
# NOTE(review): this path is not anchored at '../../' like every other data
# path in the notebook -- confirm the working directory it is meant for.
classified_comments_df = pd.read_pickle('Data/Preprocessing/english_comments_2')

# Attach the author's levels to each comment (inner join on the author name),
# then drop the 'flair' and 'Unnamed: 0' columns
comments_user_df = (
    classified_comments_df
    .merge(user_df, on='author')
    .drop(columns=['flair', 'Unnamed: 0'])
)

# Keep only the comments labelled as English
is_english = comments_user_df['body_lang'] == 'en'
comments_user_df = comments_user_df[is_english]

comments_user_df.head()

Unnamed: 0,author,body,body_lang,created_utc,number_of_words,N,A1,A2,B1,B2,C1,C2
0,alexsteb,Check out TalkToMeInKorean. They at least HAD ...,en,1666879000.0,16.0,['de'],[],[],['ko'],[],['en'],[]
1,alexsteb,"It's a toss up between Japanese, Korean and Ma...",en,1666875000.0,49.0,['de'],[],[],['ko'],[],['en'],[]
2,alexsteb,"I've seen something like those also in Göreme,...",en,1666793000.0,14.0,['de'],[],[],['ko'],[],['en'],[]
3,alexsteb,Wrong tones are like wrong vowels. It's still ...,en,1666767000.0,21.0,['de'],[],[],['ko'],[],['en'],[]
4,alexsteb,"Don't have to time to try it now, but I love y...",en,1666702000.0,19.0,['de'],[],[],['ko'],[],['en'],[]


In [44]:
# Keep only authors with a known proficiency in English
levels = ['N', 'A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# One boolean mask per level: True where the cell mentions 'en' in any letter
# case. This is the same substring test as checking 'en'/'En'/'eN'/'EN' on the
# cell text; the cells come from a CSV, so they hold the text form of a list.
# NOTE(review): a substring test would also match a longer code containing
# 'en' -- confirm that the flair parser only emits two-letter codes.
mentions_english = [
    comments_user_df[col].astype(str).str.contains('en', case=False, regex=False).to_numpy()
    for col in levels
]

# np.select keeps the FIRST level (in `levels` order) whose mask is True and
# falls back to 'None' when English appears at no level. This replaces a
# row-by-row loop whose sentinel was hard-coded to the number of levels.
proficiency = np.select(mentions_english, levels, default='None').tolist()

# Add the proficiency column, drop the per-level columns and discard the rows
# without a known proficiency in English. Building a new frame avoids writing
# into the filtered slice produced by the previous cell.
comments_user_df = (
    comments_user_df
    .assign(proficiency=proficiency)
    .drop(columns=levels)
    .loc[lambda frame: frame['proficiency'] != 'None']
)

comments_user_df.head()

Unnamed: 0,author,body,body_lang,created_utc,number_of_words,proficiency
0,alexsteb,Check out TalkToMeInKorean. They at least HAD ...,en,1666879000.0,16.0,C1
1,alexsteb,"It's a toss up between Japanese, Korean and Ma...",en,1666875000.0,49.0,C1
2,alexsteb,"I've seen something like those also in Göreme,...",en,1666793000.0,14.0,C1
3,alexsteb,Wrong tones are like wrong vowels. It's still ...,en,1666767000.0,21.0,C1
4,alexsteb,"Don't have to time to try it now, but I love y...",en,1666702000.0,19.0,C1


In [49]:
# Separate the comments of native authors (proficiency 'N') from all the others
bool_native = comments_user_df['proficiency'].eq('N')
natives_df = comments_user_df.loc[bool_native]
learners_df = comments_user_df.loc[~bool_native]

# Same feed settings for both cohorts; generate_feeds comes from preprocess
feed_options = dict(nb_feeds = 20, nb_words_per_feed = 500, exact = False, seed = 0)
native_feeds = generate_feeds(natives_df, **feed_options)
non_native_feeds = generate_feeds(learners_df, **feed_options)

print('The new datasets contains ',len(native_feeds), 'native and', len(non_native_feeds), 'non-native authors left for the developpment and evaluation stage.')

100%|██████████| 640/640 [00:01<00:00, 591.33it/s]
100%|██████████| 279/279 [00:00<00:00, 872.77it/s]

The new dataset contains 354 native and 135 non-native authors left for the development and evaluation stage.





In [None]:
# Persist both feed dataframes as pickle files, one file per cohort
pickle_targets = {
    '../../Data/Preprocessing/native_english_20feeds': native_feeds,
    '../../Data/Preprocessing/non_native_english_20feeds': non_native_feeds,
}
for target_path, feeds in pickle_targets.items():
    feeds.to_pickle(target_path)

# 4. Split the cohorts for the development and evaluation stages

In [50]:
# Reload the two feed dataframes written by the previous cell
native_feeds, non_native_feeds = (
    pd.read_pickle(feed_path)
    for feed_path in (
        '../../Data/Preprocessing/native_english_20feeds',
        '../../Data/Preprocessing/non_native_english_20feeds',
    )
)

In [None]:
# Development stage: draw 30 row positions at random from each cohort
# (natives first, then non-natives, both from the global `random` generator)
nb_native, nb_non_native = len(native_feeds), len(non_native_feeds)
tunning_samples_native     = random.sample(range(nb_native), 30)
tunning_samples_non_native = random.sample(range(nb_non_native), 30)

# Select the drawn rows by position
native_authors_tunning = native_feeds.take(tunning_samples_native)
non_native_authors_tunning = non_native_feeds.take(tunning_samples_non_native)

# Write both selections to parquet files (in order to be tuned on SCITAS)
native_authors_tunning.to_parquet('../../Data/Tuning/30native_english')
non_native_authors_tunning.to_parquet('../../Data/Tuning/30non_native_english')

In [None]:
# Evaluation stage: draw 90 natives and 90 non-natives at random and write
# them out as three disjoint groups of 30 authors per cohort.

# Exclude the rows already selected for the tuning set
test_native_feeds     = native_feeds.drop(native_authors_tunning.index, axis = 0)
test_non_native_feeds = non_native_feeds.drop(non_native_authors_tunning.index, axis = 0)

# Draw 90 row positions from each remaining cohort (natives first)
test_samples_native     = random.sample(range(len(test_native_feeds)), 90)
test_samples_non_native = random.sample(range(len(test_non_native_feeds)), 90)

# Cut each draw into consecutive chunks of 30 and save every chunk in its own
# pickle file; the files are numbered from 1
for i, start in enumerate(range(0, 90, 30)):
    chunk = slice(start, start + 30)
    native_authors_testing = test_native_feeds.iloc[test_samples_native[chunk]]
    non_native_authors_testing = test_non_native_feeds.iloc[test_samples_non_native[chunk]]
    native_authors_testing.to_pickle(f'../../Data/Test/30native_english{i+1}.pkl')
    non_native_authors_testing.to_pickle(f'../../Data/Test/30non_native_english{i+1}.pkl')