# Stage Four: Vector Space Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from nltk.tokenize import word_tokenize
import textstat

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import utilities as ut


In [4]:
training_path = '../data/WikiLarge_Train.csv'

In [5]:
%%time
df = pd.read_csv(training_path)
df.head()

CPU times: user 411 ms, sys: 50.1 ms, total: 461 ms
Wall time: 460 ms


Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [6]:
df.shape

(416768, 2)

In [7]:
# # for development purposes
# df_x = df.sample(frac=1, random_state=42)
# df_x.shape

In [8]:
text = list(df['original_text'])
text

["There is manuscript evidence that Austen continued to work on these pieces as late as the period 1809 â '' 11 , and that her niece and nephew , Anna and James Edward Austen , made further additions as late as 1814 .",
 "In a remarkable comparative analysis , Mandaean scholar Säve-Söderberg demonstrated that Mani 's Psalms of Thomas were closely related to Mandaean texts .",
 'Before Persephone was released to Hermes , who had been sent to retrieve her , Hades tricked her into eating pomegranate seeds , -LRB- six or three according to the telling -RRB- which forced her to return to the underworld for a period each year .',
 'Cogeneration plants are commonly found in district heating systems of cities , hospitals , prisons , oil refineries , paper mills , wastewater treatment plants , thermal enhanced oil recovery wells and industrial plants with large heating needs .',
 'Geneva -LRB- , ; , ; , ; ; -RRB- is the second-most-populous city in Switzerland -LRB- after Zürich -RRB- and is th

## Define the Vector Space

In [9]:
vec = TfidfVectorizer()

In [10]:
%%time
# fit the vectorizer on the corpus
# text = list(df['original_text'])
t_text = vec.fit_transform(text)

CPU times: user 4.52 s, sys: 60 ms, total: 4.58 s
Wall time: 4.59 s


In [11]:
df['transformed_text'] = t_text

df.head()

Unnamed: 0,original_text,label,transformed_text
0,There is manuscript evidence that Austen conti...,1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
1,"In a remarkable comparative analysis , Mandaea...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
2,"Before Persephone was released to Hermes , who...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
3,Cogeneration plants are commonly found in dist...,1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."


In [12]:
# when to perform the vectorization of the mean? 

In [13]:
df.shape

(416768, 3)

In [14]:
df['label'].unique()

array([1, 0])

In [15]:
zeros = df[df['label'] == 0]
ones = df[df['label'] == 1]

### Class 0: No Simplification needed

In [16]:
zero_text = list(zeros['original_text'])
# zero_text

In [17]:
# matrix of zero class transformed vectors 
matrix_tz = vec.transform(zero_text)
matrix_tz.shape

(208384, 142341)

In [18]:
mean_tz = np.mean(matrix_tz, axis=0)

### Class 1: Needs Simplification

In [19]:
ones_text = list(ones['original_text'])

In [20]:
matrix_ones = vec.transform(ones_text)
matrix_ones.shape

(208384, 142341)

In [21]:
mean_t_one = np.mean(matrix_ones, axis=0)

## Prediction by cosine similarity

In [22]:
# down sample the dataframe to make the operation manageable 
df_small = df.sample(frac=0.25, replace=False, random_state=42 )

In [23]:
df_small.shape

(104192, 3)

In [24]:
def assign_similarity(transformed_vector, mean_transformed_zeros = mean_tz, mean_transformed_ones = mean_t_one):
    similar_zero = cosine_similarity(transformed_vector, mean_transformed_zeros)[0]
    similar_one = cosine_similarity(transformed_vector, mean_transformed_ones)[0]

    if similar_zero > similar_one:
        return 0
    else:
        return 1

In [25]:
%%time
df_small['predicted'] = df_small.apply(lambda x: assign_similarity(x['transformed_text']), axis=1)

CPU times: user 1h 54min 37s, sys: 11min 50s, total: 2h 6min 27s
Wall time: 2h 6min 31s


In [26]:
df_small.head()

Unnamed: 0,original_text,label,transformed_text,predicted
8521,Diego María de la Concepción Juan Nepomuceno E...,1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\...",1
182810,Some of the 1930s trams are still in regular s...,1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\...",1
275464,Emperor Go-Momozono -LRB- Japan -RRB-,0,"(0, 1710)\t0.22163597095992751\n (0, 6179)\...",1
176814,"In other countries , potassium iodate is used ...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\...",1
196293,"Located in a region called Planalto Central , ...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\...",1


In [27]:
y_true = df_small['label']
y_pred = df_small['predicted']

In [28]:
accuracy_score(y_true, y_pred)

0.5010269502457002

In [30]:
# 10% of data 0.5016435923890875
# 25% of data 0.5010269502457002 @ time total: 2h 6min 27s