In [None]:
#get data from Phillip's google drive
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#path of data
gdrive_path = '/content/gdrive/My Drive/Sentiment Analysis Data/'
#following line opens amazon data
#data is split into sentence \t score \n
#with open(f'{gdrive_path}amazon_cells_labelled.txt', 'r') as file:

In [None]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS 
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score

#take list of filepaths to get data
filepaths = {'amazon': 'amazon_cells_labelled.txt',
               'yelp' : 'yelp_labelled.txt',
               'imdb': 'imdb_labelled.txt'}

#populate dataframes with data, separating sentences from the scores
dfs = []
for source, path in filepaths.items():
  df = pd.read_csv(path, names = ['sentence', 'score'], sep = '\t')
  #df['source'] = source
  dfs.append(df)
df = pd.concat(dfs)
print(df)

                                              sentence  score
0    So there is no way for me to plug it in here i...      0
1                          Good case, Excellent value.      1
2                               Great for the jawbone.      1
3    Tied to charger for conversations lasting more...      0
4                                    The mic is great.      1
..                                                 ...    ...
743  I just got bored watching Jessice Lange take h...      0
744  Unfortunately, any virtue in this film's produ...      0
745                   In a word, it is embarrassing.        0
746                               Exceptionally bad!        0
747  All in all its an insult to one's intelligence...      0

[2748 rows x 2 columns]


In [None]:
#remove stopwords and lemmatize

stops = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
#sentences = df['sentence'].values
modified_sentences = []


for i in range(df['sentence'].size):
  modified_sentence = []
  words = word_tokenize(df.iloc[i,0])
  for word in words:
    if word not in stops:
      lemmatizer.lemmatize(word)
      modified_sentence.append(word)
      #sentence = ' '.join(modified_sentence)
      df.iat[i,0] = modified_sentence
print(df)

                                              sentence  score
0     [So, way, plug, US, unless, I, go, converter, .]      0
1                 [Good, case, ,, Excellent, value, .]      1
2                                  [Great, jawbone, .]      1
3    [Tied, charger, conversations, lasting, 45, mi...      0
4                                 [The, mic, great, .]      1
..                                                 ...    ...
743  [I, got, bored, watching, Jessice, Lange, take...      0
744  [Unfortunately, ,, virtue, film, 's, productio...      0
745                     [In, word, ,, embarrassing, .]      0
746                            [Exceptionally, bad, !]      0
747  [All, insult, one, 's, intelligence, huge, was...      0

[2748 rows x 2 columns]


In [None]:
#vectorize text
vectorizer = text.TfidfVectorizer(min_df = 1, sublinear_tf = True, use_idf = True, ngram_range = (1, 2))
for i in range(df['sentence'].size):
  X = vectorizer.fit_transform(df.iloc[i,0])
  df.iat[i,0] = X.toarray()
print(df.iloc[0,0])

[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['score'], test_size = 0.33, shuffle = True)
print(X_train)
print(y_train)

311    [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 0....
26     [[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0....
675    [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0....
23     [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
474    [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
                             ...                        
627    [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [0....
650    [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0....
860    [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [1.0, 0.0, ...
610    [[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0], [...
281     [[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 0.0]]
Name: sentence, Length: 1841, dtype: object
311    0
26     0
675    0
23     1
474    0
      ..
627    0
650    0
860    0
610    0
281    1
Name: score, Length: 1841, dtype: int64


In [None]:
log_reg = LR()
labels = cross_val_score(log_reg, df['sentence'], df['score'], cv= 10, scoring = 'f1_macro')
print(labels.mean())

NameError: ignored