In [60]:
import numpy as np
import pandas as pd
import preprocess as PP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Read train data
train_df = pd.read_csv(filepath_or_buffer="../Data/Training/train.csv",header=None)

# Read test data
test_df = pd.read_csv(filepath_or_buffer="../Data/Training/test.csv",header=None)

# read stop word from data and add to stop_words list
with open("../Data/Classifier/stop-word.txt") as f:
    stop_word_list = f.readlines()
stop_word_list = [x.strip() for x in stop_word_list]

In [4]:
stop_word_list

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 "can't",
 'cannot',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 "let's",
 'me',
 'more',
 'most',
 "mustn't",
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'ought',
 'our',
 'ours \tourselves',
 'out',
 'over',
 'own',
 'same',
 "shan't",
 'she',
 "she'd",
 "she'll",
 "she's",
 'should',
 "sho

In [5]:
train_df.head()

Unnamed: 0,0,1
0,positive,the rock is destined to be the 21st century's ...
1,positive,"the gorgeously elaborate continuation of "" the..."
2,positive,effective but too-tepid biopic
3,positive,if you sometimes like to go to the movies to h...
4,positive,"emerges as something rare , an issue movie tha..."


In [7]:
len(train_df)

10000

In [9]:
len(train_df[train_df[0]=="positive"])

5000

In [10]:
len(train_df[train_df[0]=="negative"])

5000

In [11]:
train_df.loc[train_df[0]=='positive',0,]=1

In [12]:
train_df.loc[train_df[0]=='negative',0,]=0

In [13]:
train_df.head()

Unnamed: 0,0,1
0,1,the rock is destined to be the 21st century's ...
1,1,"the gorgeously elaborate continuation of "" the..."
2,1,effective but too-tepid biopic
3,1,if you sometimes like to go to the movies to h...
4,1,"emerges as something rare , an issue movie tha..."


In [14]:
train_df.tail()

Unnamed: 0,0,1
9995,0,showtime's starry cast could be both an asset ...
9996,0,"a determined , ennui-hobbled slog that really ..."
9997,0,too daft by half . . . but supremely good natu...
9998,0,fails in making this character understandable ...
9999,0,it's a shame that the storyline and its underl...


In [47]:
train_tweets = train_df[1].tolist()

# for i in range(len(train_df.head())):
#     train_tweets.append(train_df.head().loc[i][1])

In [48]:
len(train_tweets)

10000

In [49]:
for i in range(len(train_tweets)):
    train_tweets[i]=PP.clean_tweet(train_tweets[i])

In [50]:
cv = TfidfVectorizer(min_df=1,stop_words=stop_word_list)

In [51]:
train_cv = cv.fit_transform(train_tweets)

In [52]:
train_cv.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [53]:
cv.get_feature_names()

['00',
 '000',
 '007',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '10th',
 '11',
 '110',
 '112',
 '117',
 '11th',
 '12',
 '120',
 '123',
 '125',
 '127',
 '129',
 '12th',
 '13',
 '133',
 '13th',
 '14',
 '140',
 '146',
 '15',
 '15th',
 '16',
 '163',
 '168',
 '17',
 '170',
 '179',
 '1790',
 '18',
 '180',
 '1873',
 '1899',
 '18th',
 '19',
 '1915',
 '1920',
 '1930s',
 '1933',
 '1934',
 '1938',
 '1940s',
 '1949',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1954',
 '1955',
 '1957',
 '1958',
 '1960',
 '1960s',
 '1962',
 '1967',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1975',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1982',
 '1983',
 '1984',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '2000',
 '2001',
 '2002',
 '20th',
 '21',
 '21st',
 '22',
 '24',
 '2455',
 '25',
 '2525',
 '25s',
 '26',
 '270',
 '28k',
 '295',
 '30',
 '300',
 '3000',
 '30s',
 '33',
 '37',
 '3d',
 '40',
 '400',
 '401',
 '40s',
 '42',
 '

In [55]:
df = pd.read_csv(filepath_or_buffer="../Data/Training/training_dataset.csv",names=['senti','text'])

In [61]:
df_x = df['text']
df_y = df['senti']
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [62]:
len(df)

19332

In [63]:
len(x_train)

15465

In [64]:
len(x_test)

3867