# **Setup**

In [1]:
import pandas as pd
import numpy as np

from cleaning import convert      #function that turns data file into pandas dataframe
#from cleaning import sized_export

from langdetect import detect   #function that detects language of text

import matplotlib.pyplot as plt

In [2]:
np.random.seed(42)   #set seed for replicability (also applies to pandas functionality)

# **Conversion Of Data into Pandas Dataframe**

Here we import datasets and turn them into pandas dataframes using the convert function found in cleaning.py

In [3]:
root = "C:/Users/Bijan-PC/Documents/Coding/UNIL/Data Analysis/ADA_Project/ADA_Final/dat_raw"      #base location for where all raw data files are
clean_root = "C:/Users/Bijan-PC/Documents/Coding/UNIL/Data Analysis/ADA_Project/ADA_Final/dat_cleaned"      #base location for where all cleaned data files will be saved

twit_dta = convert(f"{root}/emotions","text.csv")        #converts twitter csv file into pandas dataframe

reddit_dta = convert(f"{root}/reddit", 'Reddit_Data.csv')

yelp_dta_train = convert(f"{root}/reviews/Yelp",'train-00000-of-00001.parquet')         #yelp data came as pre-split, they will get merged in this notebook
yelp_dta_test = convert(f"{root}/reviews/Yelp",'test-00000-of-00001.parquet')

imdb_dta = convert(f"{root}/reviews/IMDB",'IMDB Dataset.csv')

# *Twitter Posts*

The emotions are classified into six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5)

In [10]:
twit_df = twit_dta.drop('Unnamed: 0', axis =1)          #drops column 'Unnamed: 0' which stored id for each twitter post
twit_df['platform'] = 'Twitter'
twit_df['data type'] = 'Social Media'                   #Adding columns to further describe data type and task
twit_df['classification task'] = 'Multi-Class'

Now we will balance this dataset


In [11]:
twit_df['label'].value_counts()      #Clearly unbalanced

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [12]:
a = twit_df.groupby('label',group_keys=False)
twit_df = pd.DataFrame(a.apply(lambda x: x.sample(a.size().min()).reset_index(drop=True)))

  twit_df = pd.DataFrame(a.apply(lambda x: x.sample(a.size().min()).reset_index(drop=True)))


In [13]:
twit_df['label'].value_counts()      #balanced

label
0    14972
1    14972
2    14972
3    14972
4    14972
5    14972
Name: count, dtype: int64

In [33]:
twit_df["text"].apply(len).mean()   #Check average length of text

98.23113144536468

In [14]:
#export balanced dataset in csv format
twit_df.to_csv(clean_root + '/full/twit_clean.csv', index=False)  

# *Reddit Comments*

Original Dataset: \
0 Indicating a Neutral Comment \
1 Indicating a Postive comment \
-1 Indicating a Negative Comment 

In [4]:
reddit_dta = reddit_dta.rename(columns={'clean_comment': 'text', 'category': 'label'})  

In [5]:
reddit_dta['label'] = (reddit_dta['label']+1)   #add one to each label. necessary for BERT as labels with negative values lead to issues

In [6]:
def detect_langue(text): #function to detect language          
   try:
       return detect(text)      #detect function imported from langdetect package
   except:
       return "unknown"

In [7]:
reddit_dta['language'] = reddit_dta['text'].apply(detect_langue) 

In [8]:
reddit_dta = reddit_dta[reddit_dta['language'] == 'en']            #keeps only english text
reddit_dta = reddit_dta.drop('language', axis =1)               #drops language column

reddit_dta['platform'] = "Reddit"            #Adds column Platform
reddit_dta['data type'] = 'Social Media'         #Adds column Data Type
reddit_dta['classification task'] = 'Multi-Class'

Now we balance the dataset

In [9]:
reddit_dta['label'].value_counts()      #Clearly unbalanced

label
2    14951
1     8818
0     7865
Name: count, dtype: int64

In [15]:
a = reddit_dta.groupby('label',group_keys=False)
reddit_df = pd.DataFrame(a.apply(lambda x: x.sample(a.size().min()).reset_index(drop=True)))

  reddit_df = pd.DataFrame(a.apply(lambda x: x.sample(a.size().min()).reset_index(drop=True)))


In [16]:
reddit_df['label'].value_counts()      #Now it's balanced

label
0    7865
1    7865
2    7865
Name: count, dtype: int64

In [20]:
reddit_df["text"].apply(len).mean()   #Check average length of text

187.18198770926043

In [None]:
reddit_df.to_csv(clean_root + '/full/reddit_clean.csv', index=False)  #export cleaned data to dat_cleaned/full

# *IMDB Reviews*

In [17]:
imdb_dta.loc[imdb_dta['sentiment'] == 'positive', 'label'] = 1      #Maps sentiment column (string) into binary values (int)
imdb_dta.loc[imdb_dta['sentiment'] == 'negative', 'label'] = 0

imdb_dta['label'] = imdb_dta['label'].astype(int)                   #Turns label float values into integers

imdb_dta = imdb_dta.drop(['sentiment'], axis=1)                     #Drop sentiment column

In [18]:
imdb_dta = imdb_dta.rename(columns={'review': 'text'})             #rename review column to text
imdb_dta['platform'] = "IMDB"            #Adds column platform
imdb_dta['data type'] = 'Review'         #Adds column data Type
imdb_dta['classification task'] = 'Binary'      #Adds column classification task
imdb_df = imdb_dta[['text','label', 'platform', 'data type', 'classification task']]        #re-order columns

IMDB dataset is already balanced

In [19]:
imdb_df['label'].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

In [22]:
imdb_df["text"].apply(len).mean()   #Check average length of text

1309.43102

In [None]:
imdb_df.to_csv(clean_root + '/full/imdb_clean.csv', index=False)        #export cleaned data to dat_cleaned/full

# *Yelp Reviews*

In [24]:
yelp_df = pd.concat([yelp_dta_test, yelp_dta_train])                #combines pre-split train and test data into one dataset. Splitting will be done later
yelp_df = yelp_df.reset_index(drop=True)                                      #restores indices post concatination

In [25]:
def detect_langue(text): #function to detect language          
   try:
       return detect(text)      #detect function imported from langdetect package
   except:
       return "unknown"

In [26]:
#yelp_df['language'] =  yelp_lan.tolist()                       #can see what languages are present in dataset
yelp_df['language'] = yelp_df['text'].apply(detect_langue)      #adds new language column to dataframe using detect_langue function above.
yelp_df = yelp_df[yelp_df['language'] == 'en']            #keeps only english text
yelp_df = yelp_df.drop('language', axis =1)               #drops language column

yelp_df['platform'] = "Yelp"            #Adds column Platform
yelp_df['data type'] = 'Review'         #Adds column Data Type
yelp_df['classification task'] = 'Multi-Class'


In [27]:
yelp_df = yelp_df[['text','label','platform', 'data type', 'classification task']]                 #changes order of columns to the same as other dataframes

Now we balance the dataset

In [28]:
yelp_df['label'].value_counts()      #Clearly unbalanced

label
0    139213
1    139050
4    138681
2    138655
3    138477
Name: count, dtype: int64

In [29]:
a = yelp_df.groupby('label',group_keys=False)
yelp_df = pd.DataFrame(a.apply(lambda x: x.sample(a.size().min()).reset_index(drop=True)))

  yelp_df = pd.DataFrame(a.apply(lambda x: x.sample(a.size().min()).reset_index(drop=True)))


In [30]:
yelp_df['label'].value_counts()     #Now balanced

label
0    138477
1    138477
2    138477
3    138477
4    138477
Name: count, dtype: int64

In [31]:
yelp_df["text"].apply(len).mean()   #Check average length of text

733.8679982957459

In [None]:
yelp_df.to_csv(clean_root + '/full/yelp_clean.csv', index=False)  #export cleaned data to dat_cleaned/full

# **Exporting different sized datasets for each platform**

#### Now that the datasets are balanced, we can export different sized datasets. With the reddit dataset having the lowest number of observations for each label (7864), we will not exceed this number for any exports in order to make them comparable to each other


## 7864 Observations (Large)

Note: I define the function sized_export here as well as in cleaning.py \
For some reason, when I use the imported verson for cleaning.py, the new csv file gets saved as "None.csv" but this issue doesn't occur when I define the function locally. I couldn't figure it out so I left it defined here

In [None]:
def sized_export(df, size):
    def get_var_name(var):                          #function for getting the name of a variable in string format
        for name, value in globals().items():
            if value is var:
                return name
    clean_root = "C:/Users/Bijan-PC/Documents/Coding/UNIL/Data Analysis/ADA_Project/ADA_Final/dat_cleaned"      #base location for where all cleaned data files are saved

    df_name = get_var_name(df)

    rows = df.groupby('label',group_keys=False)

    new_df = pd.DataFrame(rows.apply(lambda x: x.sample(size, random_state = 2020).reset_index(drop=True)))
    
    ass = new_df['label'].value_counts()      #Clearly unbalanced

    print(ass)
    return new_df.to_csv(f"{clean_root}/{size}" + f'/{df_name}_{size}.csv', index=False)  

In [None]:
size_large = 7852

sized_export(twit_df, size_large)           #Exporting large csv files for the 4 data sets
sized_export(reddit_df, size_large)
sized_export(imdb_df, size_large)
sized_export(yelp_df, size_large)

## 3500 Observations (Medium)

In [None]:
size_medium = 3500

sized_export(twit_df, size_medium)           #Exporting medium csv files for the 4 data sets
sized_export(reddit_df, size_medium)    
sized_export(imdb_df, size_medium)
sized_export(yelp_df, size_medium)

## 500 Observations (Small)

In [None]:
size_small = 500

sized_export(twit_df, size_small)              #Exporting small csv files for the 4 data sets
sized_export(reddit_df, size_small)
sized_export(imdb_df, size_small)
sized_export(yelp_df, size_small)