# spaCy on NoSleep2020

In [1]:
import spacy
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
nosleep2020 = pd.read_csv('./Creepy Data/NoSleep/RS_2020_nosleep.csv')

In [3]:
df = nosleep2020[['title', 'selftext', 'score']]
df = df[df.selftext != '[removed]']
df = df[df.selftext != '[deleted]']
df.dropna(subset = ["selftext"], inplace=True)
df = df.replace(r'^\s+','', regex=True) 

## Try out spaCy

In [4]:
nlp = spacy.load('en_core_web_lg')
doc = nlp('I am a man...')

In [5]:
for token in doc:
    print(token.text, token.pos_, token.i, token.similarity(nlp('male')[0]))

I PRON 0 0.23022383
am AUX 1 0.19199951
a DET 2 0.29643303
man NOUN 3 0.47811988
... PUNCT 4 0.16771694


In [6]:
df

Unnamed: 0,title,selftext,score
0,Do NOT Open Your Eyes... (Pt. 1),This is the only rule of our household. If you...,1
1,Do NOT open your eyes. (The Beginning),This is the only rule of our household. If you...,1
3,My Best Friend Saw Bugs Under His Skin,It is hard for me to talk about my old friend ...,1
5,"I picked up a hitchhiker by mistake, now he's ...",They say the devil is in the details. Well th...,1
6,I'm tasked with killing nameless things out in...,“Any sign of ‘em yet?” \n\nI continued staring...,1
...,...,...,...
21218,Hylophobia,*There is no cure for trauma. Once it enters t...,1
21219,I adopted my late sisters orphaned child. This...,"I knew Persephone would need time to adjust, b...",1
21221,My first paranormal experience!!,"This isnt much, but this is surely the first u...",1
21223,I met the demon under my bed... Its not what I...,"Okay. for context, this story started about a ...",1


In [7]:
df.loc[124:140,'selftext']

124    The following events happened during the early...
125    Do you ever get that feeling…?\n\nYou know, th...
126    "Dad, are we poor?"\n\nThat's a hot knife to t...
130    Todd meandered his way all over town. He was o...
131    My wife was acting weird and I finally found o...
133    \_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_...
Name: selftext, dtype: object

## Remove Stop Words

In [8]:
def removeSW(df):
    df = df.assign(filtered_selftext='')
    for index, row in df.iterrows():
        try:
            doc = nlp(row['selftext'])
        except TypeError:
            print(row)

        # Create list of word tokens
        token_list = []
        for token in doc:
            token_list.append(token.text)

        from spacy.lang.en.stop_words import STOP_WORDS

        # Create list of word tokens after removing stopwords
        filtered_sentence =[] 

        for word in token_list:
            lexeme = nlp.vocab[word]
            if lexeme.is_stop == False:
                filtered_sentence.append(word) 
        df.loc[index, 'filtered_selftext'] = ' '.join(filtered_sentence)
    return df

In [9]:
removeddf = removeSW(df.head(1000))
# df.head(1000)

## Find the maximum similarity between each text and a certain word

In [10]:
def find_max_sim(df, token2):
#     df = df.assign(token2=0)
    token2nlp = nlp(token2)
    maxsim = 0
    
    for index, row in df.iterrows():
        doc = nlp(row['filtered_selftext'])

        # this is make sure a word has vector norm
        if(token2nlp and token2nlp.vector_norm):
            for token in doc:
                if(token and token.vector_norm):
                    if token.similarity(token2nlp) > maxsim:
                        maxsim = token2nlp.similarity(token)
        df.loc[index, token2] = maxsim
    return df

In [11]:
def list_find_max_sim(df, li = None):
    for el in li:
        newdf = find_max_sim(df, el)
    return newdf
    

In [12]:
data = list_find_max_sim(removeddf, [u'human', u'female', u'male', u'creepy'])
%time

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.77 µs


## Train model using SVM.

In [154]:
import numpy as np
import matplotlib.pyplot as plt

In [155]:
tr_data = data[[u'human', u'female', u'male', u'creepy']]
tr_target = data[['score']]

In [159]:
tr_data

Unnamed: 0,human,female,male,creepy
0,0.575584,0.396472,0.383819,0.539645
1,0.575584,0.396472,0.383819,0.539645
3,0.575584,0.576211,0.540150,0.758441
5,1.000000,0.576211,0.540150,1.000000
6,1.000000,0.576211,0.540150,1.000000
...,...,...,...,...
1418,1.000000,1.000000,1.000000,1.000000
1420,1.000000,1.000000,1.000000,1.000000
1422,1.000000,1.000000,1.000000,1.000000
1424,1.000000,1.000000,1.000000,1.000000


In [168]:
from sklearn.svm import SVC
clf = SVC(kernel = 'linear')
clf.fit(tr_data[:900], tr_target[:900].values.ravel())

SVC(kernel='linear')

In [190]:
tr_data.iloc[0:2]

Unnamed: 0,human,female,male,creepy
0,0.575584,0.396472,0.383819,0.539645
1,0.575584,0.396472,0.383819,0.539645


In [191]:
def accuracy_after_train_on_first_n_test_on_rest(n):
    clf = SVC(kernel = 'linear')
    
    try:
        clf.fit(tr_data.iloc[:n], tr_target.iloc[:n].values.ravel())
    except:
        pass
    
    predict_targs = clf.predict(tr_data.iloc[n:])
    correct_targs = tr_target.iloc[n:]
    
    return 100 * np.count_nonzero(predict_targs == correct_targs) / len(correct_targs)


n_vals = range(10, len(data), 10)
accuracy_vals = [accuracy_after_train_on_first_n_test_on_rest(n) for n in n_vals]

plt.plot(n_vals, accuracy_vals)

AttributeError: 'SVC' object has no attribute 'shape_fit_'