# Importing necessary libraries

In [385]:
import pandas as pd
import plotly.express as px
from sklearn.utils import shuffle

In [386]:
def num_of_words(df: pd.DataFrame)-> int:
    '''
    Count the whitespace-separated words over all documents.

    :param df: a data frame with sentences in column "post"
    :type df: a data frame
    :returns: the total number of words in the dataset
    :rtype: int
    '''
    # str.split() without arguments splits on any run of whitespace,
    # so consecutive spaces do not produce empty "words".
    return sum(len(text.split()) for text in df['post'])

def num_of_cars(df: pd.DataFrame)-> int:
    '''
    Count the characters (including whitespace) over all documents.

    :param df: a data frame with sentences in column "post"
    :type df: a data frame
    :returns: the total number of characters in the dataset
    :rtype: int
    '''
    # len() on the string itself avoids materialising a list of
    # characters for every document.
    return sum(len(text) for text in df['post'])

def avg_num_of_words(df: pd.DataFrame)-> int:
    '''
    Compute the average number of words per document.

    :param df: a data frame with sentences in column "post"
    :type df: a data frame
    :returns: the average number of words per document, rounded to the
        nearest integer; 0 when the data frame has no rows
    :rtype: int
    '''
    num_docs = len(df)
    # An empty data frame would otherwise raise ZeroDivisionError.
    if num_docs == 0:
        return 0
    return round(num_of_words(df) / num_docs)


def avg_num_of_cars(df: pd.DataFrame) -> int:
    '''
    Compute the average number of characters per document.

    :param df: a data frame with sentences in column "post"
    :type df: a data frame
    :returns: the average number of characters per document, rounded to
        the nearest integer; 0 when the data frame has no rows
    :rtype: int
    '''
    num_docs = len(df)
    # An empty data frame would otherwise raise ZeroDivisionError.
    if num_docs == 0:
        return 0
    return round(num_of_cars(df) / num_docs)

def num_of_cat(df: pd.DataFrame) -> int:
    '''
    Count the distinct class labels in the dataset.

    The labels are read from the last column of the data frame.

    :param df: a data frame whose last column holds the class labels
    :type df: a data frame
    :returns: the number of categories in the dataset
    :rtype: int
    '''
    return df[df.columns[-1]].nunique()

def missing_values(df: pd.DataFrame) -> None:
    '''
    Print whether the data frame contains any missing values.

    :param df: a data frame
    :type df: a data frame
    :returns: nothing; the verdict is printed to standard output
    :rtype: None
    '''
    if df.isnull().values.any():
        print('There are missing values!')
    else:
        print('There are no missing values!')

def return_vis(df: pd.DataFrame, x_label: str, y_label: str, title: str) -> None:
    '''
    Display a bar plot with the percentage distribution of classes.

    The class labels are read from the last column of the data frame and
    the documents of each class are counted through the "post" column.

    :param df: a data frame with target and feature columns
    :type df: a data frame
    :param x_label: name for the x-axis
    :type x_label: str
    :param y_label: name for the y-axis
    :type y_label: str
    :param title: title describing the barplot
    :type title: str
    :returns: nothing; the figure is rendered with ``fig.show()``
    :rtype: None
    '''
    target_col = df.columns[-1]
    classes = df[target_col].unique()
    # Share of (non-null) posts per class relative to all rows, in percent.
    percentages = [
        df.loc[df[target_col] == value, 'post'].count() / len(df) * 100
        for value in classes
    ]
    # The "count" column actually holds percentages; it is relabelled below.
    plot_df = pd.DataFrame(data={'count': percentages, target_col: classes})

    fig = px.bar(
        plot_df,
        x=target_col,
        y='count',
        labels={target_col: x_label, 'count': y_label},
    )
    fig.update_layout(title_text=title)
    fig.show()

In [387]:
# Load the three labelled datasets from their CSV files
gender_df, jud_per_df, political_df = (
    pd.read_csv(path)
    for path in ('gender.csv', 'judging_perceiving.csv', 'political_leaning.csv')
)

# Exploratory Data Analysis

In [388]:
# Preview the first rows of the gender dataset
gender_df.head()

Unnamed: 0,auhtor_ID,post,female
0,t2_rnjzutp,Good on you for being responsible! I know self...,1
1,t2_rnjzutp,"must go to the grocery store with their child,...",1
2,t2_rnjzutp,"things on her videos, and YouTube took the vid...",1
3,t2_rnjzutp,their app. There's also a program called SYNC ...,1
4,t2_rnjzutp,"side. If the cops don't take your side, you'll...",1


In [389]:
# Preview the first rows of the judging/perceiving dataset
jud_per_df.head()

Unnamed: 0,auhtor_ID,post,judging
0,t2_yo59v,"Oh...but why? Seriously, even corrected it's s...",0
1,t2_yo59v,could make out the address the second time and...,0
2,t2_yo59v,That's what he said yeah. But no he was genuin...,0
3,t2_yo59v,"here. I feel like he knows, with this wording ...",0
4,t2_yo59v,time it picked up steam lots of (ex)slaves got...,0


In [390]:
# Preview the first rows of the political leaning dataset
political_df.head()

Unnamed: 0,auhtor_ID,post,political_leaning
0,t2_7ramzeng,"You can ""buy"" the show and stream it through t...",right
1,t2_7ramzeng,"me want to play Q*bert Holy shit, based Alex J...",right
2,t2_7ramzeng,Shouldn't rely on any external services or per...,right
3,t2_7ramzeng,PR to a specific person. Usually that just mea...,right
4,t2_7ramzeng,This article's intention is clear that they wa...,right


In [391]:
# Drop the author identifier, which is not used in the analysis below.
# ('auhtor_ID' is spelled exactly as in the CSV header.)
gender_df = gender_df.drop(columns='auhtor_ID')
political_df = political_df.drop(columns='auhtor_ID')
jud_per_df = jud_per_df.drop(columns='auhtor_ID')

In [392]:
# Removing duplicate rows (the first occurrence of each row is kept)
gender_df = gender_df.drop_duplicates()
political_df = political_df.drop_duplicates()
jud_per_df = jud_per_df.drop_duplicates()

: 

: 

In [None]:
# Report, for each dataset, whether it contains any missing values
for frame in (gender_df, political_df, jud_per_df):
    missing_values(frame)

There are no missing values!
There are no missing values!
There are no missing values!


In [None]:
# Number of documents (rows) in each dataset
num_docs_gender, num_docs_jud_per, num_docs_polit = map(
    len, (gender_df, jud_per_df, political_df)
)

In [None]:
# Number of categories (distinct labels) in each dataset
num_cat_gender, num_cat_jud_per, num_cat_polit = map(
    num_of_cat, (gender_df, jud_per_df, political_df)
)

In [None]:
# Total number of words in each dataset
num_words_gender, num_words_jud_per, num_words_polit = map(
    num_of_words, (gender_df, jud_per_df, political_df)
)

In [None]:
# Total number of characters in each dataset
num_cars_gender, num_cars_jud_per, num_cars_polit = map(
    num_of_cars, (gender_df, jud_per_df, political_df)
)

In [None]:
# Average number of words per document in each dataset
avg_num_words_gender, avg_num_words_jud_per, avg_num_words_polit = map(
    avg_num_of_words, (gender_df, jud_per_df, political_df)
)

In [None]:
# Average number of characters per document in each dataset
avg_num_cars_gender, avg_num_cars_jud_per, avg_num_cars_polit = map(
    avg_num_of_cars, (gender_df, jud_per_df, political_df)
)

In [None]:
# Collect the per-dataset statistics into one summary table
data = {
    'Dataset': ['gender', 'judging_perceiving', 'political_leaning'],
    '# of documents': [num_docs_gender, num_docs_jud_per, num_docs_polit],
    '# of categories': [num_cat_gender, num_cat_jud_per, num_cat_polit],
    '# of words': [num_words_gender, num_words_jud_per, num_words_polit],
    '# of characters': [num_cars_gender, num_cars_jud_per, num_cars_polit],
    'avg # of words': [avg_num_words_gender, avg_num_words_jud_per, avg_num_words_polit],
    'avg # of characters': [avg_num_cars_gender, avg_num_cars_jud_per, avg_num_cars_polit],
}

# One row per dataset, indexed by the dataset name
results = pd.DataFrame(data)
results = results.set_index('Dataset')
results

Unnamed: 0_level_0,# of documents,# of categories,# of words,# of characters,avg # of words,avg # of characters
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gender,44635,2,65766063,357328357,1473,8006
judging_perceiving,41360,2,60998309,338109027,1475,8175
political_leaning,57184,3,84683868,475882069,1481,8322


# Visualizing the class distribution in each dataset

In [None]:
# Class distribution of the gender dataset before any undersampling
return_vis(gender_df,
           'Gender',
           'Percentage',
           'Data distribution in gender dataset (before random undersampling)')

In [None]:
# Class distribution of the judging/perceiving dataset before any undersampling
return_vis(jud_per_df,
           'Judging or perceiving',
           'Percentage',
           'Data distribution in judging perceiving dataset (before random undersampling)')

In [None]:
# Class distribution of the political leaning dataset before any undersampling
return_vis(
    political_df,
    x_label='Political leaning',
    y_label='Percentage',
    title='Data distribution in political leaning dataset (before random undersampling)',
)

# Undersampling imbalanced data

In [None]:
# Undersampling political_df: cut the larger classes down to the size of
# the minority class.
# NOTE(review): assumes 'left' is the smallest class — confirm against the
# class distribution plot of political_df.
minority_class = political_df[political_df['political_leaning'] == 'left']
majority_class1 = political_df[political_df['political_leaning'] == 'center']
majority_class2 = political_df[political_df['political_leaning'] == 'right']

# A fixed random_state keeps the sampled subset identical across runs.
majority_class1 = shuffle(majority_class1, random_state=42)[:len(minority_class)]
majority_class2 = shuffle(majority_class2, random_state=42)[:len(minority_class)]

undersampled_political_df = shuffle(
    pd.concat([minority_class, majority_class1, majority_class2]),
    random_state=42,
).reset_index(drop=True)

In [None]:
# Undersampling the judging/perceiving dataset: cut the larger class down
# to the size of the minority class.
# NOTE(review): assumes label 0 is the smaller class — confirm against the
# class distribution plot of jud_per_df.
minority_class = jud_per_df[jud_per_df['judging'] == 0]
majority_class = jud_per_df[jud_per_df['judging'] == 1]

# A fixed random_state keeps the sampled subset identical across runs.
majority_class = shuffle(majority_class, random_state=42)[:len(minority_class)]

undersampled_jud_per_df = shuffle(
    pd.concat([minority_class, majority_class]),
    random_state=42,
).reset_index(drop=True)

# Visualizing the undersampled data

In [None]:
# Class distribution of the judging/perceiving dataset after undersampling
return_vis(undersampled_jud_per_df,
           'Judging or perceiving',
           'Percentage',
           'Data distribution in judging perceiving dataset (after random undersampling)')

In [None]:
# Class distribution of the political leaning dataset after undersampling
return_vis(
    undersampled_political_df,
    x_label='Political leaning',
    y_label='Percentage',
    title='Data distribution in political leaning dataset (after undersampling)',
)