In [73]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os
import gc

import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


from wordcloud import WordCloud, STOPWORDS

In [74]:
df = pd.read_csv('reddit_relationship.csv', index_col = 0)

In [75]:
df.body[0]

"My post was capped because I didn't have a throwaway account but I'd like some advice:\n\nSo I'm (23M) getting married to my GF (23F) early next year if all goes to plan. I don't have a lot of family (I'm no contact with my dad for several years now.) She, however, has a huge extended family.\n\nFor the most part we get on great. They tend to be very loud and have huge personalities, which is sometimes hard for me to deal with since I've always been on the quiet side. But overall very warm and caring people.\n\nSomething weird I've noticed since we got engaged is that my GF's cousins sometimes make jokes about our wedding night. It's weird for them to joke about it since I'm pretty sure her family is aware we've slept together before since we share a bed at holiday gatherings and vacations.\n\nA few nights ago we were discussing wedding plans and thinking about places to go on our honeymoon (if COVID-19 clears up and we can travel by then). My GF and I were brainstorming ideas when sh

### Natural Language Processing
- What is tfidf?

- How to feature engineer in sklearn for text data?

- Use naive bayes to make classification on engineered text features

In [76]:
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    # text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('  ', ' ', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [77]:
df.body = pd.DataFrame(df.body.apply(round1))

In [78]:
df.body[0]

'My post was capped because I didnt have a throwaway account but Id like some advice So Im  getting married to my GF  early next year if all goes to plan I dont have a lot of family Im no contact with my dad for several years now She however has a huge extended family For the most part we get on great They tend to be very loud and have huge personalities which is sometimes hard for me to deal with since Ive always been on the quiet side But overall very warm and caring people Something weird Ive noticed since we got engaged is that my GFs cousins sometimes make jokes about our wedding night Its weird for them to joke about it since Im pretty sure her family is aware weve slept together before since we share a bed at holiday gatherings and vacations A few nights ago we were discussing wedding plans and thinking about places to go on our honeymoon if  clears up and we can travel by then My GF and I were brainstorming ideas when she tells me that we wont need a bridal suite right away Obv

## Tokenizing the words in the body

In [79]:
def tokenize(text):
    text = [word_tokenize(x) for x in text]
    return text

In [80]:
df.body = tokenize(df.body)

## Removing Stop Words

In [81]:
stopwords_list = stopwords.words('english')
stopwords_list += ["tldr"]

In [82]:
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [85]:
df.body = df['body'].apply(lambda x: [item for item in x if item not in stopwords_list])

## TF-IDF 

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words = stopwords_list)
text_tf = tf.fit_transform(df['body'])
text_tf.data

AttributeError: 'list' object has no attribute 'lower'

In [26]:
text_tf.data.max()

NameError: name 'text_tf' is not defined

# Naive-Bayes Model

In [89]:
df.body = df['body'].apply(lambda x: ' '.join(x))

In [90]:
df.body

0      M       p       w       c   p   p   e       b ...
1      I       n       n       r   e   l   l       h ...
2      N   e           w   f   e           n       B ...
3      T   h   r   w   w       b   e   c   u   e     ...
4      H   e   r   e       n       u   p   e       f ...
                             ...                        
255    T   h   r   w   w       c   c   u   n       k ...
256    I       l   v   e           w   e       ’     ...
257    I       f   e   e   l       l   k   e       I ...
258    H   e   l   l       e   v   e   r   n   e     ...
259    A   b   u           w   e   e   k       g     ...
Name: body, Length: 260, dtype: object

In [56]:
X = df.body
y = df.subreddit

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=20)

tfidf = TfidfVectorizer(stop_words = stopwords_list)
tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)
tfidf_data_train

<208x5296 sparse matrix of type '<class 'numpy.float64'>'
	with 27789 stored elements in Compressed Sparse Row format>

In [58]:
nb_classifier = MultinomialNB()

nb_classifier.fit(tfidf_data_train, y_train)

nb_test_preds = nb_classifier.predict(tfidf_data_test)

In [59]:
print(accuracy_score(y_test, nb_test_preds))

0.5961538461538461


In [60]:
confusion_matrix(y_test, nb_test_preds) 

array([[28,  4],
       [17,  3]])

## Random-Forest

In [61]:
rf_classifier = RandomForestClassifier(n_estimators = 250, random_state = 0, max_depth = 6)
rf_classifier.fit(tfidf_data_train, y_train)
rf_test_preds = rf_classifier.predict(tfidf_data_test)

In [62]:
print(accuracy_score(y_test, rf_test_preds))

0.6153846153846154


In [63]:
confusion_matrix(y_test, rf_test_preds)

array([[28,  4],
       [16,  4]])