# Project 3: Pushshift's API WebScrapping for Predicting Comments and Modeling

In [1]:
import requests
# importing Necessary Library
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import json
import time
import datetime as dt
import re
from bs4 import BeautifulSoup

import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV


# setting the visual
sns.set_style('whitegrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline     

## 1 Load the Data
#### Load in the the data of scraped results

In [2]:
# checking the easist way of looking the files you have in the directory
import os
print(os.listdir())

['chemistry_coms-pushshift.csv', 'comments_clean.csv', '.DS_Store', 'chemistry_subs-pushshift.csv', 'Requirements.txt', 'README.md', 'comments.csv', 'physics_coms-pushshift.csv', 'Cleaning_Dereje_starter-code.ipynb', 'physics_subs-pushshift.csv', 'Webscraping_Dereje_starter-code-.ipynb', '.ipynb_checkpoints', 'Dereje_Project_3 Presentation .pdf', 'Dere_starter-code.ipynb', 'Modeling_NLP_Dereje_starter-code.ipynb']


In [3]:
# Lets load the Concatenated DataFrame and the engine specification will help to delimit the outliers
df = pd.read_csv('comments.csv', engine='python')

In [4]:
df['subreddit'].value_counts()

chemistry    8356
Physics      6012
Name: subreddit, dtype: int64

##### 2.1 Checking the DataFrame basic Format and columns

In [5]:
df.shape

(14368, 2)

In [6]:
df.head(10)

Unnamed: 0,body,subreddit
0,You deserve the dad of the year award!,Physics
1,"I'm a layman and I found his article exciting,...",Physics
2,"Oh mah god, the article was worth it just for ...",Physics
3,I learned in third year Physics that the reaso...,Physics
4,The only right and moral approach is to give u...,Physics
5,Excellent article!,Physics
6,Hi again. Do you have a blog by any chance in ...,Physics
7,I'm not gatekeeping science. Wolfram chooses n...,Physics
8,Thanks!,Physics
9,This is a presentation by Prof. Barry Barish w...,Physics


In [7]:
# The Duplicates have diffrent time fetched the file
#df.sort_values(by=['num_comments'],inplace =True)
#df.head()

##### 2.2. Determining Missing observations

In [8]:
# Checking missing values
df.isnull().sum().sum()

0

In [9]:
# checking the null values in the DF , according to Ben this is the fancy one
(df.isnull().sum() / df.shape[0]).sort_values(ascending=False)   # credit Ben shaver

subreddit    0.0
body         0.0
dtype: float64

In [10]:
#df['id'].value_counts()    # this will help us to see if there is repetition on the titles
# can also show how many times we have scrapped  from this id of the reddit
# However, this i found out there are multiple topics

## Label target value

In [11]:
# Target is 'subreddit'. Current values: 'Physics', 'chemistry'.
# Need to transform to 0 or 1 values. Set physics = 1, chemistry = 0

df['target'] = df['subreddit'].map({'Physics': 1, 'chemistry': 0})
df.drop('subreddit', axis=1, inplace=True)
df.head()

Unnamed: 0,body,target
0,You deserve the dad of the year award!,1
1,"I'm a layman and I found his article exciting,...",1
2,"Oh mah god, the article was worth it just for ...",1
3,I learned in third year Physics that the reaso...,1
4,The only right and moral approach is to give u...,1


## Drop duplicates

In [12]:
# there are some duplicate comments, mostly automated comments by moderators

df.duplicated().sum()

666

In [13]:
df[df.duplicated()].head()

Unnamed: 0,body,target
95,[removed],1
102,[deleted],1
119,[deleted],1
145,[deleted],1
329,Okay.,1


In [14]:
df[df.duplicated()].tail()

Unnamed: 0,body,target
14240,Spread them out and spray some water on them w...,0
14264,"You have heating sources, and judging by the s...",0
14304,I did some research before I graduated but I a...,0
14319,"Ask homework, exam, and lab questions at [Chem...",0
14361,[deleted],0


In [15]:
physicsmodpost = list(df[df.duplicated()]['body'])[0]
physicsmodpost

'[removed]'

In [16]:
# number of duplicate physics mod posts
len(df[df['body'] == physicsmodpost])

261

In [17]:

chemistrymodpost = list(df[df.duplicated()]['body'])[-1]
chemistrymodpost

'[deleted]'

In [18]:
# number of duplicate chemistry mod posts
len(df[df['body'] == chemistrymodpost])

157

In [19]:
# look at other duplicate posts
df[df.duplicated() & (df['body'] != physicsmodpost) & (df['body'] != chemistrymodpost)].head(10)

Unnamed: 0,body,target
329,Okay.,1
330,Okay.,1
696,Thank you!,1
737,Thanks,1
762,"I think only for a while ""These books will be ...",1
815,Nice,1
923,Me too!,1
958,Yes!,1
1081,Yes.,1
1194,Thank you!,1


In [20]:
# look at other duplicate posts
df[df.duplicated() & (df['body'] != physicsmodpost) & (df['body'] != chemistrymodpost)].tail(10)

Unnamed: 0,body,target
14170,Thanks,0
14185,Kind of hard to beat cellulose / charcoal,0
14187,Kind of hard to beat cellulose charcoal,0
14190,Received date is not such a problem what is no...,0
14192,"On a positive note, the received date is on th...",0
14193,Thanks!,0
14240,Spread them out and spray some water on them w...,0
14264,"You have heating sources, and judging by the s...",0
14304,I did some research before I graduated but I a...,0
14319,"Ask homework, exam, and lab questions at [Chem...",0


In [21]:
df.shape

(14368, 2)

In [22]:
df.drop_duplicates(inplace=True)

In [23]:
df.shape

(13702, 2)

In [24]:
df.tail()


Unnamed: 0,body,target
14363,That set up is not proper for the safe distill...,0
14364,Yeah 200ml of alcohol.... Jesus Christ,0
14365,"Hi Guys, I am trying to play my role under cov...",0
14366,"I work with pure lithium and sodium, it’s stor...",0
14367,Cobalt is another odd duck that plays a bigger...,0


In [25]:
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,body,target
13697,That set up is not proper for the safe distill...,0
13698,Yeah 200ml of alcohol.... Jesus Christ,0
13699,"Hi Guys, I am trying to play my role under cov...",0
13700,"I work with pure lithium and sodium, it’s stor...",0
13701,Cobalt is another odd duck that plays a bigger...,0


## Clean Text

In [26]:
# some of these are redundant with the default functions of CountVectorizer but that's OK

def cleaner(text):
    # Make lowercase
    text = text.lower()

    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    
    # Remove punctuation and split 's, 't, 've with a space for filter
    text = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', text)
    
    # Remove words with 2 or fewer letters
    text = re.sub(r'\b\w{1,2}\b', '', text)
    
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+', ' ', text)
    
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text = ''.join(c for c in text if c <= '\uFFFF') 
    
    return text

# Adapted from GA office lecture and some modification

In [27]:
df['body'] = df['body'].apply(cleaner)

In [28]:
df.shape

(13702, 2)

In [29]:
# drop rows where body = ''
df = df[df['body'] != '']
df = df.reset_index(drop=True)

In [30]:
df.shape

(13663, 2)

In [31]:
df['target'].value_counts()

0    7996
1    5667
Name: target, dtype: int64

In [32]:
df['target'].value_counts(normalize = True)

0    0.58523
1    0.41477
Name: target, dtype: float64

In [33]:
print(df.median())      # median of number of comments
print(df.mean())        # mean of number of comments
df.describe()       # statistical description num-comments

target    0.0
dtype: float64
target    0.41477
dtype: float64


Unnamed: 0,target
count,13663.0
mean,0.41477
std,0.4927
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [34]:
# the describe function gives summary statistics for each variable
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,13663.0,0.41477,0.4927,0.0,0.0,0.0,1.0,1.0


In [35]:
#checking the number of unique bodies in the data set
len(df['body'].unique())

13483

In [36]:
#checking the number of unique titles in the data set
len(df['target'].unique())

2

## NLP pre-processing and exploration

### First let us lemmatize

In [37]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    lemma_words = ''
    for word in words:
        lemma_words += (lemmatizer.lemmatize(word) + ' ')
    return lemma_words

In [38]:
df['body'] = df['body'].apply(lemmatize_words)

In [39]:
df.shape

(13663, 2)

In [40]:
# drop rows where body = ''
df = df[df['body'] != '']
df = df.reset_index(drop=True)

In [41]:
df.shape

(13618, 2)

In [42]:
df.to_csv('comments_clean.csv', index=False)

### CountVectorizer

Let's check the most frequent physics words

In [43]:
 #Easy way to get most frequently used words: change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = "english", 
                             max_features = 35) 

# input for CountVectorizer is an array of strings
vector_input_physics = df[df['target'] == 1]['body']

# fit_transform the vectorizer
physics_words = count_vect.fit_transform(vector_input_physics)

# convert output to a Numpy array
physics_words = physics_words.toarray()

In [44]:
# get the words
physics_word_list = count_vect.get_feature_names()
print(physics_word_list)

['book', 'doe', 'don', 'energy', 'field', 'force', 'good', 'ha', 'hole', 'just', 'know', 'light', 'like', 'look', 'lot', 'make', 'mass', 'mean', 'need', 'particle', 'people', 'physic', 'point', 'quantum', 'question', 'really', 'say', 'theory', 'thing', 'think', 'time', 'wa', 'want', 'way', 'work']


In [45]:
 #Easy way to get most frequently used words: change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = "english", 
                             max_features = 36) 

# input for CountVectorizer is an array of strings
vector_input_chemistry = df[df['target'] == 0]['body']

# fit_transform the vectorizer
chemistry_words = count_vect.fit_transform(vector_input_chemistry)

# convert output to a Numpy array
chemistry_words = chemistry_words.toarray()

In [46]:
# get the words
chemistry_word_list = count_vect.get_feature_names()
print(chemistry_word_list)

['acid', 'chemical', 'chemistry', 'did', 'don', 'gas', 'good', 'ha', 'just', 'know', 'lab', 'like', 'look', 'lot', 'make', 'need', 'organic', 'people', 'probably', 'reaction', 'really', 'right', 'solution', 'sure', 'thanks', 'thing', 'think', 'time', 'use', 'used', 'wa', 'want', 'water', 'way', 'work', 'year']


In [47]:
from sklearn.feature_extraction import text

text.ENGLISH_STOP_WORDS

add_stop_words = ['did', 'doe', 'don', 'doesn', 'getting', 'going', 'got', 'ha', 'isn', 'wa']

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### n-gram frequency:

In [48]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool.
# CountVectorizer transforms the body text from the reddit comments into features (i.e. words)
# and creates columns (vectors) with word counts for each comment

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = stop_words, 
                             max_features = 10000, 
                             ngram_range=(1, 3)
                            ) 

# input for CountVectorizer is an array of strings
vector_input_physics = df[df['target'] == 1]['body']

# fit_transform the vectorizer
physics_words = count_vect.fit_transform(vector_input_physics)

# convert output to a Numpy array
physics_words = physics_words.toarray()

In [49]:
physics_matrix = pd.DataFrame(physics_words, columns=count_vect.get_feature_names())

physics_matrix.sum().sort_values(ascending=False).head(10)

physic    1140
just      1064
like       906
time       883
know       649
think      648
energy     618
work       518
really     508
make       500
dtype: int64

In [50]:
physics_matrix.mean().sort_values(ascending=False).head(10)

physic    0.201841
just      0.188385
like      0.160411
time      0.156339
know      0.114908
think     0.114731
energy    0.109419
work      0.091714
really    0.089943
make      0.088527
dtype: float64

In [51]:
count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = stop_words, 
                             max_features = 10000, 
                             ngram_range=(1, 3)
                            ) 

# input for CountVectorizer is an array of strings
vector_input_chemistry = df[df['target'] == 0]['body']

# fit_transform the vectorizer
chemistry_words = count_vect.fit_transform(vector_input_chemistry)

# convert output to a Numpy array
chemistry_words = chemistry_words.toarray()

In [52]:
chemistry_matrix = pd.DataFrame(chemistry_words, columns=count_vect.get_feature_names())

chemistry_matrix.sum().sort_values(ascending=False).head(10)

just         1236
like         1104
chemistry     869
make          764
water         680
use           589
think         570
know          544
good          530
really        520
dtype: int64

## TF-IDF Vectorizer

In [53]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

physics_tf_words = tvec.fit_transform(vector_input_physics)

physics_tf_words = physics_tf_words.toarray()

physics_matrix = pd.DataFrame(physics_tf_words, columns=tvec.get_feature_names())

physics_matrix.sum().sort_values(ascending=False).head(50)

physic         92.411993
just           89.731902
like           82.171611
thanks         79.288298
time           77.909034
thank          71.780039
think          69.016026
know           64.894098
work           56.470629
really         54.893780
good           52.582786
energy         52.291155
make           50.286004
mean           48.117055
yes            47.649357
force          47.235588
thing          45.775950
way            44.484612
theory         44.004079
quantum        43.147982
video          42.785147
people         40.909571
look           40.832873
book           40.601709
question       39.945590
need           39.007322
right          38.607341
point          38.153462
particle       37.788283
want           37.740596
lot            37.488950
say            36.875269
field          36.404930
mass           36.219828
idea           35.856920
sure           34.889708
light          33.717698
hole           33.680024
understand     33.400785
read           33.184875


In [54]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

chemistry_tf_words = tvec.fit_transform(vector_input_chemistry)

chemistry_tf_words = chemistry_tf_words.toarray()

chemistry_matrix = pd.DataFrame(chemistry_tf_words, columns=tvec.get_feature_names())

chemistry_matrix.sum().sort_values(ascending=False).head(50)

just         126.118437
like         121.205708
thanks       111.327797
chemistry     95.897813
make          89.340642
thank         82.260279
water         75.965921
think         74.974083
know          73.167886
use           71.872094
good          71.527116
work          66.933165
really        66.323375
look          65.745814
yes           62.704541
reaction      60.132977
acid          59.698165
lab           58.549072
time          57.671816
want          56.136298
need          56.066624
used          55.866539
thing         53.877688
sure          52.732365
yeah          50.958590
probably      49.681627
chemical      49.399374
lot           48.919566
way           48.510996
right         47.435830
cool          44.116339
question      43.687698
nice          43.154307
say           42.074801
solution      41.913385
lol           41.602918
year          41.388212
help          40.815377
pretty        40.655688
maybe         39.958631
stuff         39.812805
organic       39

In [55]:
# checking the easist way of looking the files you have in the directory
#import os
#print(os.listdir())