In [161]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
import re

In [157]:
!pip install regex



In [101]:
# Read the Clippers CSV export into a DataFrame; the bare filename is resolved
# relative to the notebook's current working directory.
Clipper_df = pd.read_csv('ClipperJuly9.csv')

In [102]:
# Read the Lakers CSV export into a DataFrame; the bare filename is resolved
# relative to the notebook's current working directory.
Laker_df = pd.read_csv('LakerJuly9.csv')

## Check shape of df

In [103]:
Clipper_df.shape

(2464, 104)

In [104]:
Laker_df.shape

(2501, 102)

In [105]:
# Clippers rows left after keeping only the first row for each distinct
# `selftext` value; `Clipper_df` itself is not modified by this cell.
unique = Clipper_df.drop_duplicates(subset=['selftext'], keep='first')

In [106]:
len(unique)

441

## Drop duplicate rows

In [107]:
# Remove rows that are exact duplicates across every column (first occurrence
# is kept) and rebind the name to the de-duplicated frame.
Clipper_df = Clipper_df.drop_duplicates(keep='first')

In [108]:
Clipper_df.shape

(2279, 104)

In [109]:
# Lakers rows left after keeping only the first row for each distinct
# `selftext` value; `Laker_df` itself is not modified by this cell.
unique2 = Laker_df.drop_duplicates(subset=['selftext'], keep='first')

In [110]:
len(unique2)

269

In [111]:
# Remove rows that are exact duplicates across every column (first occurrence
# is kept) and rebind the name to the de-duplicated frame.
Laker_df = Laker_df.drop_duplicates(keep='first')

In [112]:
Laker_df.shape

(2030, 102)

In [113]:
Clipper_df.columns

Index(['all_awardings', 'allow_live_comments', 'approved_at_utc',
       'approved_by', 'archived', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext',
       ...
       'thumbnail_width', 'title', 'total_awards_received', 'ups', 'url',
       'user_reports', 'view_count', 'visited', 'whitelist_status', 'wls'],
      dtype='object', length=104)

## Check for nulls and drop columns that are not useful

In [114]:
Clipper_df.isnull().sum()

all_awardings                       0
allow_live_comments                 0
approved_at_utc                  2279
approved_by                      2279
archived                            0
author                              0
author_cakeday                   2271
author_flair_background_color    2279
author_flair_css_class           1036
author_flair_richtext               0
author_flair_template_id         1231
author_flair_text                1036
author_flair_text_color          1036
author_flair_type                   0
author_fullname                     0
author_patreon_flair                0
banned_at_utc                    2279
banned_by                        2279
can_gild                            0
can_mod_post                        0
category                         2279
clicked                             0
content_categories               2279
contest_mode                        0
created                             0
created_utc                         0
crosspost_pa

In [115]:
# Keep only the six fields used downstream rather than enumerating ~100 columns
# to drop. The previous drop list repeated 'distinguished' and raised KeyError
# whenever a listed column was absent from the export; selecting the columns to
# keep yields the same six-column frame without that maintenance burden.
keep_cols = ['author', 'num_comments', 'score', 'selftext', 'subreddit', 'title']

# .copy() gives an independent frame so later in-place edits (e.g. fillna) do
# not act on a slice of the raw export.
Clipper_df = Clipper_df[keep_cols].copy()

In [116]:
Clipper_df.columns

Index(['author', 'num_comments', 'score', 'selftext', 'subreddit', 'title'], dtype='object')

In [117]:
Laker_df.columns

Index(['all_awardings', 'allow_live_comments', 'approved_at_utc',
       'approved_by', 'archived', 'author', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id',
       ...
       'thumbnail_width', 'title', 'total_awards_received', 'ups', 'url',
       'user_reports', 'view_count', 'visited', 'whitelist_status', 'wls'],
      dtype='object', length=102)

In [118]:
# Keep only the six fields used downstream rather than enumerating ~100 columns
# to drop. The previous drop list repeated 'distinguished' and had to be
# hand-edited to differ from the Clippers one because the two exports do not
# share exactly the same columns; selecting the columns to keep yields the same
# six-column frame for either export.
keep_cols = ['author', 'num_comments', 'score', 'selftext', 'subreddit', 'title']

# .copy() gives an independent frame so later in-place edits (e.g. fillna) do
# not act on a slice of the raw export.
Laker_df = Laker_df[keep_cols].copy()

In [119]:
Laker_df.columns

Index(['author', 'num_comments', 'score', 'selftext', 'subreddit', 'title'], dtype='object')

In [120]:
Clipper_df.head()

Unnamed: 0,author,num_comments,score,selftext,subreddit,title
0,lolwtferic,1115,2285,"UM, HOLY FUCK.",LAClippers,[MEGATHREAD] KAWHI LEONARD AND PAUL GEORGE TO ...
1,apm_music,90,20,,LAClippers,[GAME THREAD - SUMMER LEAGUE] LA Clippers v. W...
2,iJoinedCuzFuckChuck,15,112,,LAClippers,"Stolen from twitter, had to post it here"
3,parkercola13,35,62,,LAClippers,Transcript from 2003 Donald Sterling depositio...
4,SHAMG0D,19,288,,LAClippers,"STRAP UP, WE HOME"


### `selftext` contains nulls, but we will keep the column

In [121]:
Clipper_df.isnull().sum()

author             0
num_comments       0
score              0
selftext        1238
subreddit          0
title              0
dtype: int64

## Fill null values with a single space

In [122]:
# Replace every missing value in the frame with a one-space string and rebind
# the name to the filled frame.
Clipper_df = Clipper_df.fillna(value=" ")

In [123]:
Clipper_df.isnull().sum()

author          0
num_comments    0
score           0
selftext        0
subreddit       0
title           0
dtype: int64

In [124]:
Laker_df.isnull().sum()

author             0
num_comments       0
score              0
selftext        1407
subreddit          0
title              0
dtype: int64

In [125]:
# Replace every missing value in the frame with a one-space string and rebind
# the name to the filled frame.
Laker_df = Laker_df.fillna(value=" ")

In [126]:
Laker_df.isnull().sum()

author          0
num_comments    0
score           0
selftext        0
subreddit       0
title           0
dtype: int64

## Concatenate the two DataFrames

In [127]:
# Stack the Lakers rows on top of the Clippers rows and renumber the row index
# from 0 so labels from the two source frames cannot collide.
LA_df = pd.concat(objs=[Laker_df, Clipper_df], axis=0, ignore_index=True)

In [128]:
LA_df.head()

Unnamed: 0,author,num_comments,score,selftext,subreddit,title
0,AutoModerator,33,10,Daily discussion about anything Lakers related...,lakers,Daily Lakers Discussion Thread - July 11
1,rickat99,63,877,,lakers,Whatever it takes
2,djmcc28,42,237,,lakers,Two GOATs 🐐🐐
3,Masicka636,17,154,,lakers,Our Boys!
4,daftmunt,250,2270,,lakers,"[Moreno] DeMarcus Cousins: ""My quad is 100% he..."


In [129]:
Laker_df.dtypes

author          object
num_comments     int64
score            int64
selftext        object
subreddit       object
title           object
dtype: object

## Combine `selftext` and `title` into one text column

In [176]:
# One free-text field per row: the post body, a single space, then the title.
LA_df['text'] = LA_df['selftext'].str.cat(LA_df['title'], sep=' ')

In [131]:
LA_df.shape

(4309, 7)

In [132]:
LA_df.head()

Unnamed: 0,author,num_comments,score,selftext,subreddit,title,selftext_title
0,AutoModerator,33,10,Daily discussion about anything Lakers related...,lakers,Daily Lakers Discussion Thread - July 11,Daily discussion about anything Lakers related...
1,rickat99,63,877,,lakers,Whatever it takes,Whatever it takes
2,djmcc28,42,237,,lakers,Two GOATs 🐐🐐,Two GOATs 🐐🐐
3,Masicka636,17,154,,lakers,Our Boys!,Our Boys!
4,daftmunt,250,2270,,lakers,"[Moreno] DeMarcus Cousins: ""My quad is 100% he...","[Moreno] DeMarcus Cousins: ""My quad is 100% ..."


## Import the Snowball stemmer to clean the text

In [184]:
from nltk.stem.snowball import SnowballStemmer

In [185]:
# English Snowball stemmer shared by the text-cleaning helper.
snow = SnowballStemmer('english')

# Tokenizer that emits each run of word characters (\w+) as one token, so
# punctuation and whitespace act only as separators.
tokenizer = RegexpTokenizer(r'\w+')

In [187]:
# Iterated on Erin's code.
def snow_text(text):
    """Return the list of lower-cased Snowball stems for the words in `text`.

    Straight apostrophes are deleted before tokenizing, so a contraction is
    kept as a single token instead of being split at the apostrophe by the
    word-character tokenizer.
    """
    without_apostrophes = text.replace("'", "")
    stems = []
    for word in tokenizer.tokenize(without_apostrophes):
        stems.append(snow.stem(word.lower()))
    return stems


# Stem every row's combined text; each cell of the new column is a list of stems.
LA_df['snow_text'] = LA_df['text'].map(snow_text)

### Sanity check

In [190]:
LA_df.head()

Unnamed: 0,author,num_comments,score,selftext,subreddit,title,selftext_title,text,snow_text,texty
0,AutoModerator,33,10,Daily discussion about anything Lakers related...,lakers,Daily Lakers Discussion Thread - July 11,Daily discussion about anything Lakers related...,Daily discussion about anything Lakers related...,"[daili, discuss, about, anyth, laker, relat, o...",[daili discuss about anyth laker relat off top...
1,rickat99,63,877,,lakers,Whatever it takes,Whatever it takes,Whatever it takes,"[whatev, it, take]",[whatev it take]
2,djmcc28,42,237,,lakers,Two GOATs 🐐🐐,Two GOATs 🐐🐐,Two GOATs 🐐🐐,"[two, goat]",[two goat]
3,Masicka636,17,154,,lakers,Our Boys!,Our Boys!,Our Boys!,"[our, boy]",[our boy]
4,daftmunt,250,2270,,lakers,"[Moreno] DeMarcus Cousins: ""My quad is 100% he...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[moreno, demarcus, cousin, my, quad, is, 100, ...",[moreno demarcus cousin my quad is 100 heal my...


### New column: stems joined into one string (wrapped in a list)

In [189]:
# For each row, join the stems with single spaces and wrap the resulting
# string in a one-element list.
LA_df['texty'] = LA_df['snow_text'].map(lambda stems: [' '.join(stems)])

In [191]:
LA_df.head()

Unnamed: 0,author,num_comments,score,selftext,subreddit,title,selftext_title,text,snow_text,texty
0,AutoModerator,33,10,Daily discussion about anything Lakers related...,lakers,Daily Lakers Discussion Thread - July 11,Daily discussion about anything Lakers related...,Daily discussion about anything Lakers related...,"[daili, discuss, about, anyth, laker, relat, o...",[daili discuss about anyth laker relat off top...
1,rickat99,63,877,,lakers,Whatever it takes,Whatever it takes,Whatever it takes,"[whatev, it, take]",[whatev it take]
2,djmcc28,42,237,,lakers,Two GOATs 🐐🐐,Two GOATs 🐐🐐,Two GOATs 🐐🐐,"[two, goat]",[two goat]
3,Masicka636,17,154,,lakers,Our Boys!,Our Boys!,Our Boys!,"[our, boy]",[our boy]
4,daftmunt,250,2270,,lakers,"[Moreno] DeMarcus Cousins: ""My quad is 100% he...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[moreno, demarcus, cousin, my, quad, is, 100, ...",[moreno demarcus cousin my quad is 100 heal my...


### New column as a string

In [193]:
# Textual form of each one-element list in `texty` (brackets and quotes included).
LA_df['string'] = LA_df['texty'].map(str)

In [194]:
LA_df.head()

Unnamed: 0,author,num_comments,score,selftext,subreddit,title,selftext_title,text,snow_text,texty,string
0,AutoModerator,33,10,Daily discussion about anything Lakers related...,lakers,Daily Lakers Discussion Thread - July 11,Daily discussion about anything Lakers related...,Daily discussion about anything Lakers related...,"[daili, discuss, about, anyth, laker, relat, o...",[daili discuss about anyth laker relat off top...,['daili discuss about anyth laker relat off to...
1,rickat99,63,877,,lakers,Whatever it takes,Whatever it takes,Whatever it takes,"[whatev, it, take]",[whatev it take],['whatev it take']
2,djmcc28,42,237,,lakers,Two GOATs 🐐🐐,Two GOATs 🐐🐐,Two GOATs 🐐🐐,"[two, goat]",[two goat],['two goat']
3,Masicka636,17,154,,lakers,Our Boys!,Our Boys!,Our Boys!,"[our, boy]",[our boy],['our boy']
4,daftmunt,250,2270,,lakers,"[Moreno] DeMarcus Cousins: ""My quad is 100% he...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[moreno, demarcus, cousin, my, quad, is, 100, ...",[moreno demarcus cousin my quad is 100 heal my...,['moreno demarcus cousin my quad is 100 heal m...


### Plain string without square brackets or apostrophes

In [196]:
# Space-joined stems as a plain string, with no brackets or quotes.
# Built directly from the stem lists instead of deleting "[", "]" and "'" from
# the list repr held in `string`: that approach would leave stray characters
# behind if repr() ever switched to double quotes or backslash escapes. For the
# word-character tokens produced by `snow_text` the result is the same text.
LA_df['stringOG'] = [' '.join(stems) for stems in LA_df['snow_text']]

In [197]:
LA_df.head()

Unnamed: 0,author,num_comments,score,selftext,subreddit,title,selftext_title,text,snow_text,texty,string,stringOG
0,AutoModerator,33,10,Daily discussion about anything Lakers related...,lakers,Daily Lakers Discussion Thread - July 11,Daily discussion about anything Lakers related...,Daily discussion about anything Lakers related...,"[daili, discuss, about, anyth, laker, relat, o...",[daili discuss about anyth laker relat off top...,['daili discuss about anyth laker relat off to...,daili discuss about anyth laker relat off topi...
1,rickat99,63,877,,lakers,Whatever it takes,Whatever it takes,Whatever it takes,"[whatev, it, take]",[whatev it take],['whatev it take'],whatev it take
2,djmcc28,42,237,,lakers,Two GOATs 🐐🐐,Two GOATs 🐐🐐,Two GOATs 🐐🐐,"[two, goat]",[two goat],['two goat'],two goat
3,Masicka636,17,154,,lakers,Our Boys!,Our Boys!,Our Boys!,"[our, boy]",[our boy],['our boy'],our boy
4,daftmunt,250,2270,,lakers,"[Moreno] DeMarcus Cousins: ""My quad is 100% he...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[Moreno] DeMarcus Cousins: ""My quad is 100% ...","[moreno, demarcus, cousin, my, quad, is, 100, ...",[moreno demarcus cousin my quad is 100 heal my...,['moreno demarcus cousin my quad is 100 heal m...,moreno demarcus cousin my quad is 100 heal my ...


In [198]:
# Persist the combined, cleaned frame to 'LA_df.csv' in the working directory,
# omitting the row index.
# NOTE(review): the list-valued columns (`snow_text`, `texty`) will presumably
# be written as their string form and come back as plain strings on reload --
# confirm that downstream code only needs the string columns.
LA_df.to_csv('LA_df.csv', index=False)