# Exploratory Reddit Data Analysis

Import relevant packages. 
- We are going to need the big three: pandas, numpy, matplotlib. 
- We also will use nltk.corpus to examine stop words.
- From Sklearn, we will pull in
    - train_test_split, cross_val_score
    - Models:
        - LogisticRegression
        - NaiveBayes
        - SVC
        - RandomForestClassifier
        - BaggingClassifier

In [1]:
%%time
import pandas as pd
import numpy as np
import regex as re
import matplotlib.pyplot as plt


from bs4 import BeautifulSoup

from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

CPU times: user 1.87 s, sys: 591 ms, total: 2.46 s
Wall time: 13.6 s


## Import Data
Here we are bringing in our cleaned dataset coming from P3

In [2]:
# Bring in the clean data
df = pd.read_csv('../data/clean.csv')

# View a preview
clean.head(2)

In [None]:
# Check our data
df.info()

In [None]:
if clean.clean_title.isna().sum() > 0:
    clean.drop(labels = clean[clean.clean_title.isna()].index, inplace = True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39813 entries, 0 to 39812
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           39813 non-null  object
 1   selftext        23811 non-null  object
 2   clean_title     39783 non-null  object
 3   selftext_urls   8152 non-null   object
 4   title_urls      93 non-null     object
 5   clean_selftext  23806 non-null  object
 6   created_utc     39813 non-null  int64 
 7   num_comments    39813 non-null  int64 
 8   num_crossposts  39813 non-null  int64 
 9   score           39813 non-null  int64 
 10  subreddit       39813 non-null  object
 11  Subreddit_name  39813 non-null  int64 
 12  merged          39793 non-null  object
dtypes: int64(5), object(8)
memory usage: 3.9+ MB


In [None]:
df.isnull().sum()

## Creating custom stop words

In [None]:
# sklearn's stopwords, extracted
sklearn_stopwords = list(CountVectorizer(stop_words = 'english').get_stop_words())

#Custom created list
custom_stopwords = ['good','time','python','tool','source','best','learn','science']

# Personalized stopwords
personal_stopwords = sklearn_stopwords + custom_stopwords

## TRYING SOME EDA

## Frequency of numerical data

In [None]:
df.groupby(by = 'subreddit').mean()

In [None]:
plt.hist(df['score'])

In [None]:
list_of_words = [len(element) for element in df['title']]

Here we will create a count vectorizer to conduct some EDA

In [None]:
cvec = CountVectorizer(stop_words = 'english', min_df=4, max_df = 1.0) #stop_words = 'english',

In [None]:
term_matrix = cvec.fit_transform(df['clean_title'])

In [None]:
len(cvec.get_feature_names())

In [None]:
# Create a dataframe with our term_matrix outputted from Count_vec
term_df = pd.DataFrame(term_matrix.toarray(), columns = cvec.get_feature_names())
# Lets insert our target as "Subreddit"
term_df.insert(0, 'Subreddit_name', df['Subreddit_name'])

In [None]:
term_df.groupby('Subreddit_name').mean().T.sort_values(1, ascending=False).head(100)

In [None]:
def plot_word_frequency(word, term_df):
''' 
    word (string): Any word that exists in the dataframe of term frequency
    term_df (dataframe): A dataframe that lists the word frequency of each word in two different corpuses.
    This is how you add document strings to your functions
'''
    term_df[term_df['Subreddit_name']==0]['word'].value_counts().to_dict() 
    

In [None]:
#plot_word_frequency()

In [None]:
# Machine Learning frequency of "data"
term_df[term_df['Subreddit']==0]['data'].value_counts().to_dict() 

In [None]:
# Data science frequency of "data"
term_df[term_df['Subreddit']==1]['data'].value_counts().to_dict()

In [None]:

term_df[term_df['Subreddit']==0]['learning'].value_counts().to_dict()

In [None]:
term_df[term_df['Subreddit']==1]['learning'].value_counts().to_dict()

In [None]:
# Machine Learning frequency of "data"
term_df[term_df['Subreddit_name']==0]['data'].value_counts().to_dict() 

In [None]:
# Data science frequency of "data"
term_df[term_df['Subreddit_name']==1]['data'].value_counts().to_dict()

In [None]:

term_df[term_df['Subreddit_name']==0]['learning'].value_counts().to_dict()

In [None]:
term_df[term_df['Subreddit_name']==1]['learning'].value_counts().to_dict()

In [None]:
# Machine Learning frequency of "data"
term_df[term_df['Subreddit_name']==0]['help'].value_counts().to_dict() 

In [None]:
# Data science frequency of "data"
term_df[term_df['Subreddit_name']==1]['help'].value_counts().to_dict()

In [None]:

term_df[term_df['Subreddit_name']==0]['learning'].value_counts().to_dict()

In [None]:
term_df[term_df['Subreddit_name']==1]['learning'].value_counts().to_dict()

In [None]:
top_words_ML = list(term_df.groupby('Subreddit_name').
     mean().T.sort_values(0, ascending=False).head(250).index)

top_words_DS = list(term_df.groupby('Subreddit_name').
     mean().T.sort_values(1, ascending=False).head(250).index)

In [None]:
top_words_overlap = [element for element in top_words_DS if element in top_words_ML]

In [None]:
len(top_words_overlap)

And we could conduct a hypothesis test on this.

$H_0$: The subreddits for DS and ML have the same mean frequency for word $x$.

$H_A$: The subreddits for DS and ML have a different mean frequency for word $x$.

We'll set our alpha at .05

In [None]:
from scipy.stats import ttest_ind

In [None]:
ttest_dict = {}

# Cycle through each word in overlap list
for word in top_words_overlap:
    
    # Conduct a t-test, and append the result statistic
    ttest_dict[word] = ttest_ind(term_df[term_df['Subreddit_name']==1][word], # word count in DS
         term_df[term_df['Subreddit_name']==0][word]) # word count dist in ML
    
ttest_dict

In [None]:
# Create a dataframe to examine common top words and their p-values

ttest_df = pd.DataFrame([ttest_dict]).T.sort_values(0)
ttest_df['pvalue'] = [element.pvalue for element in ttest_dict.values()]
ttest_df['statistic'] = [element.statistic for element in ttest_dict.values()]
ttest_df.drop(columns = 0, inplace = True)

In [None]:
ttest_df.sort_values(by='pvalue')

In [None]:
def plot_cvec_dist(words, dataframe, target = 'Subreddit_name', classes = [0,1]):
    nrows = len(words)//2 if not len(words)%2 else len(words)//2 + 1
    class_0 = dataframe[dataframe[target]==classes[0]]
    class_1 = dataframe[dataframe[target]==classes[1]]
    fig, ax = plt.subplots(ncols=2, nrows=nrows, figsize=(20, 7*nrows))
    ax = ax.ravel()
    
    for i, word in enumerate(words):
        counts_0 = class_0[word].value_counts()[1:].to_dict()
        counts_1 = class_1[word].value_counts()[1:].to_dict()
        mean_0 = class_0[word].mean()
        mean_1 = class_1[word].mean()
        ax[i].bar(counts_0.keys(), counts_0.values(), color='goldenrod', alpha=.3)
        for keys, values in counts_0.items():
            ax[i].text(keys-.1, values, s=values, fontsize=14, color='goldenrod')
        for keys, values in counts_1.items():
            ax[i].text(keys+.1, values, s=values, fontsize=14, color='grey')
        ax[i].bar(counts_1.keys(), counts_1.values(), color='grey', alpha=.3)
        ymin, ymax = ax[i].get_ylim()
        ax[i].plot([mean_0]*2, [ymin, ymax], ':', color='goldenrod')
        ax[i].plot([mean_1]*2, [ymin, ymax], ':', color='grey')
        ax[i].set_title(f'{word} frequency counts\nmeans: {mean_0:0.02f} vs {mean_1:0.02f}')
        ax[i].legend(classes)

In [None]:
plot_cvec_dist(top_words_overlap, term_df)