In [1]:
import pandas as pd



In [2]:
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

In [3]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)


In [4]:
df_list

[                                              sentence  label source
 0                             Wow... Loved this place.      1   yelp
 1                                   Crust is not good.      0   yelp
 2            Not tasty and the texture was just nasty.      0   yelp
 3    Stopped by during the late May bank holiday of...      1   yelp
 4    The selection on the menu was great and so wer...      1   yelp
 ..                                                 ...    ...    ...
 995  I think food should have flavor and texture an...      0   yelp
 996                           Appetite instantly gone.      0   yelp
 997  Overall I was not impressed and would not go b...      0   yelp
 998  The whole experience was underwhelming, and I ...      0   yelp
 999  Then, as if I hadn't wasted enough of my life ...      0   yelp
 
 [1000 rows x 3 columns],
                                               sentence  label  source
 0    So there is no way for me to plug it in here i...      

In [6]:
df = pd.concat(df_list)
print(df.iloc[0])


sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [7]:
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [16]:
from sklearn.model_selection import train_test_split

df_yelp = df[df['source'] == 'yelp']

x = df_yelp['sentence'].values
y = df_yelp['label'].values

x_train, x_test, y_train, y_test = train_test_split(
   x, y, test_size=0.25, random_state=1000)

In [18]:
df_yelp.head(5)

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [19]:
x_train[:10]

array(['The food was barely lukewarm, so it must have been sitting waiting for the server to bring it out to us.',
       'Sorry, I will not be getting food from here anytime soon :(',
       'Of all the dishes, the salmon was the best, but all were great.',
       'The fries were not hot, and neither was my burger.',
       "In fact I'm going to round up to 4 stars, just because she was so awesome.",
       'Will go back next trip out.',
       'This was my first crawfish experience, and it was delicious!',
       "I could barely stomach the meal, but didn't complain because it was a business lunch.",
       'A great way to finish a great.',
       'Best service and food ever, Maria our server was so good and friendly she made our day.'],
      dtype=object)

## shape the data in an ecceptable shape by regression model

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(x_train)
X_test  = vectorizer.transform(x_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.796


## apply the above on all data

## amazon

In [26]:
df_amazon = df[df['source'] == 'amazon']

x = df_amazon['sentence'].values
y = df_amazon['label'].values

x_train, x_test, y_train, y_test = train_test_split(
   x, y, test_size=0.25, random_state=1000)

In [27]:
df_amazon.head(5)

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(x_train)
X_test  = vectorizer.transform(x_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 4837 stored elements in Compressed Sparse Row format>

In [29]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.8


## imdb

In [30]:
df_imdb = df[df['source'] == 'amazon']

x = df_imdb['sentence'].values
y = df_imdb['label'].values

x_train, x_test, y_train, y_test = train_test_split(
   x, y, test_size=0.25, random_state=1000)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(x_train)
X_test  = vectorizer.transform(x_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 4837 stored elements in Compressed Sparse Row format>

In [32]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.8
