# machine learning intro

## importing and get files paths

In [91]:
import pandas as pd
# train data
from sklearn.model_selection import train_test_split
# Shape the data in an acceptable shape by Regression Model
from sklearn.feature_extraction.text import CountVectorizer
# Train/classify the data using Logistic Regression
from sklearn.linear_model import LogisticRegression
import matplotlib

filePathDec = {'yelp':'data/yelp_labelled.txt',
                'amazon':'data/amazon_cells_labelled.txt', 
                'imdb':'data/imdb_labelled.txt'
              }


## make all data in files as list and change the extention as a csv files

In [92]:
df_list=[]
for source, filepath in filePathDec.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] =source
    df_list.append(df)

df_list

[                                              sentence  label source
 0                             Wow... Loved this place.      1   yelp
 1                                   Crust is not good.      0   yelp
 2            Not tasty and the texture was just nasty.      0   yelp
 3    Stopped by during the late May bank holiday of...      1   yelp
 4    The selection on the menu was great and so wer...      1   yelp
 ..                                                 ...    ...    ...
 995  I think food should have flavor and texture an...      0   yelp
 996                           Appetite instantly gone.      0   yelp
 997  Overall I was not impressed and would not go b...      0   yelp
 998  The whole experience was underwhelming, and I ...      0   yelp
 999  Then, as if I hadn't wasted enough of my life ...      0   yelp
 
 [1000 rows x 3 columns],
                                               sentence  label  source
 0    So there is no way for me to plug it in here i...      

## concatenating data as csv review

In [93]:
df = pd.concat(df_list)
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


## divid data for each file into test and training data

In [94]:
# for yelp data
df_yelb = df[df['source'] == 'yelp']
sentence_values_yelp = df_yelb['sentence'].values # x-axis
label_values_yelp = df_yelb['label'].values # y-axis
x_train_yelp, x_test_yelp, y_train_yelp, y_test_yelp = train_test_split(sentence_values_yelp, label_values_yelp, test_size=0.2, random_state=1)

# for amazon data
df_amazon = df[df['source'] == 'amazon']
sentence_values_amazon = df_amazon['sentence'].values # x-axis
label_values_amazon = df_amazon['label'].values # y-axis
x_train_amazon, x_test_amazon, y_train_amazon, y_test_amazon = train_test_split(sentence_values_amazon, label_values_amazon, test_size=0.2, random_state=2)

# for imdb data
df_imdb = df[df['source'] == 'imdb']
sentence_values_imdb = df_imdb['sentence'].values # x-axis
label_values_imdb = df_imdb['label'].values # y-axis
x_train_imdb, x_test_imdb, y_train_imdb, y_test_imdb = train_test_split(sentence_values_imdb, label_values_imdb, test_size=0.2, random_state=3)


# train data and transform it as series of 1 and 0 of unique words

In [95]:
# for yelp data
vectorizer_yelp = CountVectorizer()
vectorizer_yelp.fit(x_train_yelp)

transformed_x_train_yelp = vectorizer_yelp.transform(x_train_yelp)
transformed_x_test_yelp = vectorizer_yelp.transform(x_test_yelp)

# for amazon data
vectorizer_amazon = CountVectorizer()
vectorizer_amazon.fit(x_train_amazon)

transformed_x_train_amazon = vectorizer_amazon.transform(x_train_amazon)
transformed_x_test_amazon = vectorizer_amazon.transform(x_test_amazon)

# for imdb data
vectorizer_imdb = CountVectorizer()
vectorizer_imdb.fit(x_train_imdb)

transformed_x_train_imdb = vectorizer_imdb.transform(x_train_imdb)
transformed_x_test_imdb = vectorizer_imdb.transform(x_test_imdb)

## make a model and test it 

In [96]:
# for yelp data
classifier_yelp = LogisticRegression()
classifier_yelp.fit(transformed_x_train_yelp, y_train_yelp)
score_yelp = classifier_yelp.score(transformed_x_test_yelp, y_test_yelp)
score_yelp

# for amazon data
classifier_amazon = LogisticRegression()
classifier_amazon.fit(transformed_x_train_amazon, y_train_amazon)
score_amazon = classifier_amazon.score(transformed_x_test_amazon, y_test_amazon)
score_amazon

# for imdb data
classifier_imdb = LogisticRegression()
classifier_imdb.fit(transformed_x_train_imdb, y_train_imdb)
score_imdb = classifier_imdb.score(transformed_x_test_imdb, y_test_imdb)
print('score for yelp= ', score_yelp)
print('score for amazon= ', score_amazon)
print('score for imdb= ', score_imdb)


score for yelp=  0.84
score for amazon=  0.81
score for imdb=  0.7533333333333333


## try to do keras

In [97]:
from keras.models import Sequential
from keras import layers

ModuleNotFoundError: No module named 'tensorflow'