# Stage Four: Vector Space Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from nltk.tokenize import word_tokenize
import textstat

In [2]:
from sklearn.model_selection import train_test_split
# from sklearn.decomposition import PCA
# from sklearn.neighbors import KNeighborsClassifier 
# from sklearn.cluster import KMeans
# from sklearn.metrics import accuracy_score
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import utilities as ut


In [4]:
training_path = '../data/WikiLarge_Train.csv'

In [9]:
%%time
df = pd.read_csv(training_path)
df.head()

CPU times: user 410 ms, sys: 36.7 ms, total: 447 ms
Wall time: 446 ms


Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [10]:
df.shape

(416768, 2)

In [7]:
# # for development purposes
# df_x = df.sample(frac=1, random_state=42)
# df_x.shape

(416768, 4)

In [45]:
text = list(df['original_text'])
text

## Define the Vector Space

In [11]:
vec = TfidfVectorizer()

In [65]:
%%time
# fit the vectorizer on the corpus
# text = list(df['original_text'])
t_text = vec.fit_transform(text)

CPU times: user 4.42 s, sys: 6.62 ms, total: 4.42 s
Wall time: 4.43 s


In [66]:
df['transformed_text'] = t_text

df.head()

Unnamed: 0,original_text,label,transformed_text
0,There is manuscript evidence that Austen conti...,1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
1,"In a remarkable comparative analysis , Mandaea...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
2,"Before Persephone was released to Hermes , who...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
3,Cogeneration plants are commonly found in dist...,1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"(0, 1710)\t0.22163597095992751\n (0, 6179)\..."


In [39]:
# when to perform the vectorization of the mean? 

In [51]:
df.shape

(416768, 2)

In [52]:
df['label'].unique()

array([1, 0])

In [55]:
zeros = df[df['label'] == 0]
ones = df[df['label'] == 1]

### Class 0: No Simplification needed

In [57]:
zero_text = list(zeros['original_text'])
# zero_text

In [58]:
# matrix of zero class transformed vectors 
matrix_tz = vec.transform(zero_text)
matrix_tz.shape

(208384, 142341)

In [59]:
mean_tz = np.mean(matrix_tz, axis=0)

### Class 1: Needs Simplification

In [61]:
ones_text = list(ones['original_text'])

In [62]:
matrix_ones = vec.transform(ones_text)
matrix_ones.shape

(208384, 142341)

In [63]:
mean_t_one = np.mean(matrix_ones, axis=0)

## Prediction by cosine similarity

In [None]:
# down sample the dataframe to make the operation manageable 
df_small = df.sample(frac=0.1, replace=False, random_state=42 )

In [69]:
def assign_similarity(transformed_vector, mean_transformed_zeros = mean_tz, mean_transformed_ones = mean_t_one):
    similar_zero = cosine_similarity(transformed_vector, mean_transformed_zeros)[0]
    similar_one = cosine_similarity(transformed_vector, mean_transformed_ones)[0]

    if similar_zero > similar_one:
        return 0
    else:
        return 1

In [70]:
%%time
df_small['predicted'] = df_small.apply(lambda x: assign_similarity(x['transformed_text']), axis=1)

KeyboardInterrupt: 