## Goal: find French and English comments in a given set of comments

In [None]:
# packages used afterwards
import pandas as pd
import numpy as np
from Language_Detection import *

In [None]:
# Load the dataframe containing the comments to classify.
#
# Optional toy dataset for quick manual tests (uncomment the two lines below and
# comment out the read_pickle call): six short comments in the 'body' column,
# with 'body_lang' initialised to -1 for every row.
#sample_rows = [['et oui mais bon je voudrais pas dire dass ich nicht hier bin, weil du','Hello there, what are you doing?','Bonjour, eh oui c\'est pas poli','Check out TalkToMeInKorean. They at least HAD something like that, don\'t know their current product offering.','als du es dir vorgestellt hast.','Don\'t have to time to try it now, but I love your website (and app) design. Nice popping colors.'],[-1,-1,-1,-1,-1,-1]]
#initial_df = pd.DataFrame(sample_rows, index=['body', 'body_lang']).T
initial_df = pd.read_pickle('Data/processed_comments')


## Language classification using the package langdetect

In [None]:
# First language-detection pass over every loaded comment, using the helper
# built on the langdetect Python package.
# NOTE(review): the seed is presumably there to make detection reproducible —
# confirm in Language_Detection.
langdetect_classification_df = langdetect_dataframe(initial_df, seed=4)

# Preview the first 50 classified comments.
langdetect_classification_df.head(n=50)

In [None]:
# Persist the result of the first detection pass to a pickle file; the next
# section reloads it from this path, so that section can be run without
# repeating the classification.
langdetect_classification_df.to_pickle('Data/Classified/langdetect_classification')

In [None]:
# Number of comments per detected language label after the first pass.
langdetect_classification_df.value_counts(subset=['body_lang'])

## Remove undefined comments containing URLs, then run a new language detection

In [None]:
# Reload the file saved at the end of the previous section (first langdetect pass),
# so this section can be run without re-running the classification above.
df = pd.read_pickle('Data/Classified/langdetect_classification')

In [None]:
# Drop the comments that are BOTH still undefined ('U') AND contain a hyperlink.
# Comments with a link that were already classified are kept.
#   - regex=False: 'http' is matched as a plain substring, not as a regular expression.
#   - na=False: a missing (or non-string) body counts as "no link" instead of
#     putting NaN into the boolean mask.
links = df['body'].str.contains('http', regex=False, na=False) & (df['body_lang'] == 'U')
df1 = df[~links]

In [None]:
# Second langdetect pass, restricted to the comments that are still
# undefined ('U'), this time with a different seed.
undefined_mask = df1['body_lang'].eq('U')
df2 = langdetect_dataframe(df1.loc[undefined_mask], seed=0)

In [None]:
# Save both intermediate frames to pickle files:
#   - df1: first-pass result without the undefined comments that contained a link
#   - df2: second-pass result for the comments that were still undefined
df1.to_pickle('Data/Classified/langdetect_classification_1')
df2.to_pickle('Data/Classified/langdetect_classification_2')

In [None]:
# Reload the second-pass result saved in the previous cell.
# Note: df1 is NOT reloaded here, so the cells above that define df1 must have
# been run in this kernel before the merge below.
df2= pd.read_pickle('Data/Classified/langdetect_classification_2')

In [None]:
# Merge the newly classified comments back into the previous dataframe.
# combine_first gives priority to df2: wherever df2 has a non-null value it is
# kept, and everything else (rows or cells missing from df2) is taken from df1.
df3 = df2.combine_first(df1)

In [None]:
# Number of comments per language label after merging the second pass.
df3.value_counts(subset=['body_lang'])

In [None]:
# Split the merged frame into one frame per language label:
# 'fr' / 'en' for French / English, 'N' for the frame named "other",
# 'U' for the frame named "undef".
lang_label = df3['body_lang']
df4_fr = df3.loc[lang_label.eq('fr')]
df4_en = df3.loc[lang_label.eq('en')]
df4_other = df3.loc[lang_label.eq('N')]
df4_undef = df3.loc[lang_label.eq('U')]

In [None]:
# Number of distinct authors in df1 (i.e. before the per-language split).
# NOTE(review): the message below assumes every author in the input already had
# more than 5000 words — presumably filtered upstream; confirm.
N = df1['author'].nunique()

# Total number of words per author, computed separately for each language.
df_sum_fr = df4_fr.groupby('author')['number_of_words'].sum()
df_sum_en = df4_en.groupby('author')['number_of_words'].sum()

# Report how many authors still reach 5000 words within a single language.
print("There were ",N," authors with >5000 words")
print("And after this classification, ")
print(df_sum_fr.ge(5000).sum(), "  for french")
print(df_sum_en.ge(5000).sum(), " for english")

In [None]:
# Persist the merged frame and each per-language subset to its own pickle file.
frames_by_path = {
    'Data/Classified/langdetect_classification_3': df3,
    'Data/Classified/french_comments': df4_fr,
    'Data/Classified/english_comments': df4_en,
    'Data/Classified/other_language_comments': df4_other,
    'Data/Classified/undefined_comments': df4_undef,
}
for pickle_path, frame in frames_by_path.items():
    frame.to_pickle(pickle_path)


In [None]:
# Reload the frames saved by the previous cell, in the same order they were
# written, so the notebook can be resumed from this point.
df3, df4_fr, df4_en, df4_other, df4_undef = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'Data/Classified/langdetect_classification_3',
        'Data/Classified/french_comments',
        'Data/Classified/english_comments',
        'Data/Classified/other_language_comments',
        'Data/Classified/undefined_comments',
    )
]

## Human classification ##

**Be careful with the order of the cells!**
````{verbatim}
---> Beginning : load a file containing undefined comments
^  v
^  v
^  v      ....classify.....
^  v
^  v
<--- End : overwrite the language attribution (replacing the previous 'U') for each comment you classified
````

So be sure to run the last cells of this section, otherwise you will lose what you have just classified!

In [None]:
# LOAD DATA TO CLASSIFY
# df_non_splitted = the whole set of comments (fr,en,U and N)
# df_to_classify  = the comments whose language is still undefined
#
# 'non_splitted_to_classify' is only ever written by the last cell of this
# section, so it does not exist the first time this section is run. In that case
# start from the merged langdetect result saved above, which holds the same
# full set of comments.
try:
    df_non_splitted = pd.read_pickle('Data/Classified/non_splitted_to_classify')
except FileNotFoundError:
    df_non_splitted = pd.read_pickle('Data/Classified/langdetect_classification_3')
df_to_classify  = df_non_splitted[df_non_splitted['body_lang']=='U']

In [None]:
# Classify the undefined comments by hand, for as long as you like.
# At each prompt, type:
#        0 for neither English nor French,
#        1 for English,
#        2 for French,
#        e to exit the function.
# NOTE(review): human_class_df is not defined in this notebook — presumably it
# comes from the Language_Detection star import; confirm its return value there.
df_partially_classified = human_class_df(df_to_classify)

In [None]:
# If you want to see what you classified (change the argument to see more lines)
df_partially_classified.head(50)

In [None]:
# Put what has been classified by hand back into the non-splitted dataframe.
# combine_first gives priority to df_partially_classified: its non-null values
# are kept, and everything else is taken from df_non_splitted.
df_non_splitted_new = df_partially_classified.combine_first(df_non_splitted)

In [None]:
# Save the updated data, overwriting the same pickle file that the first cell
# of this section loads. Run this cell before stopping, otherwise the manual
# labels from this session are lost.
df_non_splitted_new.to_pickle('Data/Classified/non_splitted_to_classify')