# Cleaning

Import relevant packages:
- The big three: pandas, numpy, and matplotlib
- BeautifulSoup for stripping HTML artifacts from our data
- NLTK stop words and scikit-learn for text vectorization and modeling

In [1]:
import pandas as pd
import numpy as np
import regex as re
import matplotlib.pyplot as plt


from bs4 import BeautifulSoup

from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

### Read in the Data

In [2]:
# Load the scraped posts for each subreddit (paths are relative to the notebook)
ml_path = './data/machinelearning_1.csv'
ds_path = './data/datascience_1.csv'
mc = pd.read_csv(ml_path)
ds = pd.read_csv(ds_path)

In [3]:
# Stack the two subreddit frames and renumber the rows 0..n-1 in one step
df = pd.concat([mc, ds], ignore_index=True)

In [4]:
# Preview the first rows of the combined frame
df.head()

Unnamed: 0,title,selftext,created_utc,num_comments,num_crossposts,score,subreddit
0,[D] Hinton responds to Schmidhuber,,1587609168,0,0,1,MachineLearning
1,Hinton responds to Schmidhuber,,1587609111,1,0,1,MachineLearning
2,"[D] Other than vectorization, what other aspec...",I'm helping a friend design a course with dual...,1587606108,2,0,1,MachineLearning
3,Survey for IT Employees working from home! Hel...,,1587604741,2,0,1,MachineLearning
4,[R] Chip Placement with Deep Reinforcement Lea...,,1587604558,1,0,1,MachineLearning


### Cleaning [deleted] and [removed] rows from title and selftext

Some rows still contain the placeholder text `[deleted]` or `[removed]` in `title` or `selftext`.

Let's build boolean masks that find those rows and then drop them.

In [5]:
# Boolean masks for titles that were replaced by a placeholder
title_removed = (df['title']=='[removed]')
title_deleted = (df['title']=='[deleted]')

# Boolean masks for selftexts that were replaced by a placeholder
selftext_removed = (df['selftext']=='[removed]')
selftext_deleted = (df['selftext']=='[deleted]')

# A row is dropped when ANY of the four masks flags it. Previously all four were
# checked but only the selftext '[deleted]' rows were actually removed.
placeholder_rows = title_removed | title_deleted | selftext_removed | selftext_deleted

if placeholder_rows.any():
    # drop=True keeps the old index from being written back as an extra 'index'
    # column, which would otherwise appear only on runs where something was dropped.
    df = df.loc[~placeholder_rows].reset_index(drop=True)

### Clean out each review

In [6]:
# Cleans one raw post string into space-separated lowercase words
def review_to_words(raw_review):
    """Turn one raw text into a cleaned, space-joined string of lowercase words.

    Steps, in order: strip HTML markup, remove http/https URLs, replace every
    non-letter character with a space, lowercase, and drop NLTK English stop words.

    raw_review (str): the text to clean. Non-string values (e.g. NaN) yield ''.

    Returns a str of the remaining words joined by single spaces.
    """
    # Missing values arrive as float NaN / None; there is nothing to clean.
    if not isinstance(raw_review, str):
        return ""

    # Name the parser explicitly so bs4 does not pick one per-install (and warn).
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()

    # URLs must go BEFORE punctuation is stripped: once ':' and '/' have been
    # replaced by spaces the URL pattern can no longer match anything.
    without_urls = re.sub(r'\(?https?://\S+', ' ', review_text)

    letters_only = re.sub("[^a-zA-Z]", " ", without_urls)
    words = letters_only.lower().split()
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)

In [None]:
%%time

# Clean every title with review_to_words
clean_titles = [review_to_words(element) for element in df['title']]

# DataFrame.insert takes `value`, not `values` (the old keyword raised TypeError).
# insert() also refuses to add a column that already exists, so overwrite on re-run.
if 'clean_title' in df.columns:
    df['clean_title'] = clean_titles
else:
    df.insert(2, column = 'clean_title', value = clean_titles)

In [66]:
raw_review = 'here is a sample URL https://lzone.de/examples/Python%20re.sub or how about this https://docs.python.org/3/library/re.html'
def review_to_url(raw_review):
    """Return the host names of every http(s) URL in raw_review, space-separated.

    raw_review (str): free text that may contain URLs.

    Returns a str such as 'lzone.de docs.python.org', or '' when no URL is found.
    """
    # Capture only the host: everything after '://' up to the first path, port,
    # query or fragment delimiter, whitespace or bracket. URLs without a path
    # (e.g. 'https://example.com') previously matched nothing and raised IndexError.
    hosts = re.findall(r'https?://([^\s/:?#()\[\]<>]+)', raw_review)

    # Sentence punctuation directly after a bare host is not part of the domain.
    hosts = [host.rstrip('.,;') for host in hosts]
    return ' '.join(host for host in hosts if host)



lzone.de docs.python.org


In [None]:
# TODO(review): this cell contained only an unfinished `for` statement, which is a
# SyntaxError and halts Restart & Run All. Presumably it was meant to apply
# review_to_url to each post — confirm the intent and implement it here.
pass

### Now we work on our data

In [None]:
# (rows, columns) of df at this point in the notebook
df.shape

### Create a subreddit column

In [None]:
# Binary target column: 1 for posts from 'datascience', 0 for every other subreddit
is_datascience = df['subreddit'] == 'datascience'
df['Subreddit_name'] = is_datascience.astype('int64')

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file
df.to_csv('./data/clean.csv', index = False)

## MODELING

In [None]:
# Features are the cleaned titles; the target is the 0/1 subreddit label
X = df['clean_title']
y = df['Subreddit_name']

# stratify=y keeps the class balance the same in both splits.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Bag-of-words vectorizer: sklearn's English stop-word list, terms in more than
# 80% of documents ignored (max_df), vocabulary capped at 173 features.
cvec = CountVectorizer(stop_words = 'english', max_df = 0.8, max_features = 173)

# Learn the vocabulary from the training titles only
cvec.fit(X_train,y_train)

In [None]:
# Document-term counts for the training titles
C_train = cvec.transform(X_train)

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cvec, 'get_feature_names_out'):
    train_terms = cvec.get_feature_names_out()
else:
    train_terms = cvec.get_feature_names()

transformed_train_df = pd.DataFrame(C_train.toarray(),
                             columns = train_terms)

# Standardise each term column (fit on the training data only)
ss = StandardScaler()
ss.fit(transformed_train_df, y_train)
SS_train = ss.transform(transformed_train_df)

In [None]:
# Rebuilds the same unscaled training count frame as the previous cell.
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(cvec, 'get_feature_names_out'):
    train_terms = cvec.get_feature_names_out()
else:
    train_terms = cvec.get_feature_names()

transformed_train_df = pd.DataFrame(C_train.toarray(),
                             columns = train_terms)



In [None]:

# Logistic regression with default settings
tr = LogisticRegression()

# Fit on the unscaled training counts
tr.fit(transformed_train_df, y_train)
# NOTE(review): len() of coef_ is its number of rows; for a two-class target that is
# presumably 1, not the number of features — confirm whether coef_[0] was intended.
len(tr.coef_)

### SVC model



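Fit a support vector classifier on the standardized count features and check its accuracy on the training data.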

In [None]:
# Train a default SVC on the standardised counts, then report training accuracy
svc = SVC().fit(SS_train, y_train)
svc.score(SS_train, y_train)

In [None]:

# cross_val_score() with no arguments raises TypeError: it needs an estimator, X and y.
# 5-fold cross-validated accuracy of a default SVC on the standardised training counts.
# NOTE(review): presumably the SVC above is what was meant to be validated — confirm.
cross_val_score(SVC(), SS_train, y_train, cv=5)

In [None]:
# Number of coefficients in the first row of the fitted model's coef_
len(tr.coef_[0])

## EDA

## Creating custom stop words

In [None]:
# Pull sklearn's built-in English stop words out of a throwaway vectorizer
sklearn_stopwords = [*CountVectorizer(stop_words = 'english').get_stop_words()]

# Hand-picked additions
custom_stopwords = ['good','time','python','tool','source','best','learn','science']

# Combined list: sklearn's words first, then the custom ones
personal_stopwords = [*sklearn_stopwords, *custom_stopwords]

In [None]:
# Size of the combined stop-word list
len(personal_stopwords)

In [None]:
# Regularisation strengths to search: 15 values log-spaced from 1e-2 to 1e1
C = np.logspace(-2,1,15)

# TF-IDF features feeding a logistic regression
pipe = Pipeline([
    ('vec',TfidfVectorizer()),
    ('lr',LogisticRegression())
])

# Pipeline hyperparameter grid; only lr__C has more than one candidate.
# The duplicated 'vec__max_df' key was removed (a dict literal silently keeps
# only the last occurrence of a repeated key).
# NOTE(review): personal_stopwords is built earlier but 'english' is used here —
# confirm whether the custom list was meant to be searched instead.
pipe_params = {
    'vec__max_features':[173],
    'vec__max_df':[0.8],
    'vec__stop_words': ['english'],
    'vec__ngram_range':[(1,1)],
    'lr__C': C
}

# 5-fold grid search, then accuracy on the training and held-out splits
gs = GridSearchCV(pipe, pipe_params, cv= 5)
gs.fit(X_train, y_train)
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))
gs.best_params_

In [None]:
# Inspect the steps of the refit, tuned pipeline. gs.estimator is only the unfitted
# template passed to GridSearchCV, so it does not show the selected hyperparameters.
gs.best_estimator_.named_steps

## TRYING SOME EDA

## Frequency of numerical data

In [None]:
# Per-subreddit mean of the numeric columns. df also holds text columns
# (title, selftext, ...), which recent pandas refuses to average unless
# numeric_only=True is given.
df.groupby(by = 'subreddit').mean(numeric_only=True)

In [None]:
# Histogram of post scores, labelled so the figure reads on its own;
# the trailing semicolon suppresses the text repr of the return value.
fig, ax = plt.subplots()
ax.hist(df['score'])
ax.set(title='Distribution of post scores', xlabel='score', ylabel='number of posts');

In [None]:
# len() of each title.
# NOTE(review): assuming titles are strings, len() counts characters, so despite its
# name this list holds character counts rather than word counts — confirm the intent.
list_of_words = [len(element) for element in df['title']]

Here we will create a count vectorizer to conduct some EDA

In [None]:
# Vectorizer for EDA: sklearn's English stop words, min_df=4, max_df=1.0.
# Note: this rebinds the name `cvec` that the modelling cells use.
cvec = CountVectorizer(stop_words = 'english', min_df=4, max_df = 1.0)

In [None]:
# Fit the EDA vectorizer on every cleaned title and get the document-term matrix
term_matrix = cvec.fit_transform(df['clean_title'])

In [None]:
# Number of terms kept by the vectorizer. vocabulary_ has one entry per feature and,
# unlike get_feature_names(), exists in every scikit-learn release.
len(cvec.vocabulary_)

In [None]:
# Column labels for the term matrix. get_feature_names() was removed from newer
# scikit-learn releases in favour of get_feature_names_out(); use whichever exists.
if hasattr(cvec, 'get_feature_names_out'):
    term_names = cvec.get_feature_names_out()
else:
    term_names = cvec.get_feature_names()

# One row per document, one column per term
term_df = pd.DataFrame(term_matrix.toarray(), columns = term_names)

# Put the 0/1 target in front. Row i of term_matrix came from row i of df, so insert
# by position (to_numpy) rather than relying on the two indexes happening to align.
term_df.insert(0, 'Subreddit_name', df['Subreddit_name'].to_numpy())

In [None]:
# Mean count of every term within each class, transposed so terms are rows,
# ranked by the class-1 (datascience) mean; show the top 100
term_df.groupby('Subreddit_name').mean().T.sort_values(1, ascending=False).head(100)

In [None]:
def plot_word_frequency(word, term_df):
    '''
    Tally how often `word` occurs per document among the class-0 rows of term_df.

    word (string): Name of a term column that exists in term_df.
    term_df (dataframe): Term-count frame with a 'Subreddit_name' column (0/1)
        and one column per term.

    Returns a dict mapping each per-document count of `word` to the number of
    class-0 rows that have that count. No figure is drawn.
    '''
    # Index with the argument itself; the literal 'word' looked up a column named "word".
    return term_df[term_df['Subreddit_name']==0][word].value_counts().to_dict()
    

In [None]:
# The function requires a word and the term-count frame; calling it with no
# arguments raises TypeError. "data" is the word the following cells inspect.
plot_word_frequency('data', term_df)

In [None]:
# Machine Learning frequency of "data"
# (the target column is named 'Subreddit_name'; 'Subreddit' raised KeyError)
term_df[term_df['Subreddit_name']==0]['data'].value_counts().to_dict()

In [None]:
# Data science frequency of "data"
# (the target column is named 'Subreddit_name'; 'Subreddit' raised KeyError)
term_df[term_df['Subreddit_name']==1]['data'].value_counts().to_dict()

In [None]:

# Machine Learning frequency of "learning"
# (the target column is named 'Subreddit_name'; 'Subreddit' raised KeyError)
term_df[term_df['Subreddit_name']==0]['learning'].value_counts().to_dict()

In [None]:
# Data science frequency of "learning"
# (the target column is named 'Subreddit_name'; 'Subreddit' raised KeyError)
term_df[term_df['Subreddit_name']==1]['learning'].value_counts().to_dict()

In [None]:
# Class 0 (Machine Learning): per-document count of "data" -> number of documents
term_df[term_df['Subreddit_name']==0]['data'].value_counts().to_dict() 

In [None]:
# Class 1 (Data science): per-document count of "data" -> number of documents
term_df[term_df['Subreddit_name']==1]['data'].value_counts().to_dict()

In [None]:

# Class 0 (Machine Learning): per-document count of "learning" -> number of documents
term_df[term_df['Subreddit_name']==0]['learning'].value_counts().to_dict()

In [None]:
# Class 1 (Data science): per-document count of "learning" -> number of documents
term_df[term_df['Subreddit_name']==1]['learning'].value_counts().to_dict()

In [None]:
# Machine Learning frequency of "help"
term_df[term_df['Subreddit_name']==0]['help'].value_counts().to_dict() 

In [None]:
# Data science frequency of "help"
term_df[term_df['Subreddit_name']==1]['help'].value_counts().to_dict()

In [None]:

# Machine Learning frequency of "learning" (same expression as an earlier cell)
term_df[term_df['Subreddit_name']==0]['learning'].value_counts().to_dict()

In [None]:
# Data science frequency of "learning" (same expression as an earlier cell)
term_df[term_df['Subreddit_name']==1]['learning'].value_counts().to_dict()

In [None]:
# Mean count of every term per class, with terms as rows and classes (0, 1) as columns
class_means = term_df.groupby('Subreddit_name').mean().T

# The 250 terms with the highest mean count in each class
top_words_ML = list(class_means.sort_values(0, ascending=False).head(250).index)
top_words_DS = list(class_means.sort_values(1, ascending=False).head(250).index)

In [None]:
# Words in both top lists, kept in the data-science ranking order
ml_top_lookup = set(top_words_ML)
top_words_overlap = [word for word in top_words_DS if word in ml_top_lookup]

In [None]:
# How many top words the two subreddits share
len(top_words_overlap)

And we could conduct a hypothesis test on this.

$H_0$: The subreddits for DS and ML have the same mean frequency for word $x$.

$H_A$: The subreddits for DS and ML have a different mean frequency for word $x$.

We'll set our alpha at .05

In [None]:
from scipy.stats import ttest_ind

In [None]:
# Split the term counts by class once
ds_rows = term_df[term_df['Subreddit_name']==1]  # data science documents
ml_rows = term_df[term_df['Subreddit_name']==0]  # machine learning documents

# Two-sample t-test (DS vs ML) on the per-document counts of each shared top word
ttest_dict = {
    word: ttest_ind(ds_rows[word], ml_rows[word])
    for word in top_words_overlap
}

ttest_dict

In [None]:
# Table of the shared top words with their p-values and test statistics.
# It is built straight from ttest_dict so each value stays on its own word's row.
# Previously the rows were sorted first and the value lists were then assigned in
# the dict's unsorted order, which attached numbers to the wrong words.
ttest_df = pd.DataFrame(
    {
        'pvalue': [result.pvalue for result in ttest_dict.values()],
        'statistic': [result.statistic for result in ttest_dict.values()],
    },
    index = list(ttest_dict.keys()),
).sort_values('statistic')

In [None]:
# Rank the shared words from smallest to largest p-value
ttest_df.sort_values(by='pvalue')

In [None]:
def plot_cvec_dist(words, dataframe, target = 'Subreddit_name', classes = (0, 1)):
    """Plot, for each word, how its non-zero per-document counts are distributed in two classes.

    One panel is drawn per word, two panels per row. Each panel overlays a bar chart
    of count -> number of documents for both classes, annotates every bar with its
    height, and marks each class's mean count with a dotted vertical line.

    words (iterable of str): term columns of `dataframe` to plot.
    dataframe (DataFrame): term-count frame containing `target` and the word columns.
    target (str): name of the class-label column.
    classes (sequence of 2): the two label values to compare, in plotting order.

    Returns None. Nothing is drawn when `words` is empty.
    """
    words = list(words)
    if not words:
        # plt.subplots cannot build a grid with zero rows
        return

    # Two panels per row, rounding up for an odd number of words
    nrows = (len(words) + 1) // 2
    class_0 = dataframe[dataframe[target]==classes[0]]
    class_1 = dataframe[dataframe[target]==classes[1]]
    fig, ax = plt.subplots(ncols=2, nrows=nrows, figsize=(20, 7*nrows))
    ax = ax.ravel()

    for i, word in enumerate(words):
        # Leave out documents where the word does not occur. Drop the 0 bucket by
        # label: slicing off the first row only works when 0 is the most common count.
        counts_0 = class_0[word].value_counts().drop(0, errors='ignore').to_dict()
        counts_1 = class_1[word].value_counts().drop(0, errors='ignore').to_dict()
        mean_0 = class_0[word].mean()
        mean_1 = class_1[word].mean()

        ax[i].bar(counts_0.keys(), counts_0.values(), color='goldenrod', alpha=.3)
        ax[i].bar(counts_1.keys(), counts_1.values(), color='grey', alpha=.3)

        # Bar-height labels, nudged left/right so the two classes do not collide
        for keys, values in counts_0.items():
            ax[i].text(keys-.1, values, s=values, fontsize=14, color='goldenrod')
        for keys, values in counts_1.items():
            ax[i].text(keys+.1, values, s=values, fontsize=14, color='grey')

        # Dotted vertical line at each class mean, spanning the current y-range
        ymin, ymax = ax[i].get_ylim()
        ax[i].plot([mean_0]*2, [ymin, ymax], ':', color='goldenrod')
        ax[i].plot([mean_1]*2, [ymin, ymax], ':', color='grey')
        ax[i].set_title(f'{word} frequency counts\nmeans: {mean_0:0.02f} vs {mean_1:0.02f}')
        ax[i].legend(classes)

    # With an odd number of words the last panel of the grid is unused; hide it
    for unused_axis in ax[len(words):]:
        unused_axis.set_visible(False)

In [None]:
# Draw one panel per shared top word, comparing the two subreddits
plot_cvec_dist(top_words_overlap, term_df)