In [9]:
import pandas as pd

In [10]:
filepath_dict = {'yelp': 'data/yelp_labelled.txt',
                'amazon': 'data/amazon_cells_labelled.txt',
                'imdb': 'data/imdb_labelled.txt'}

In [12]:
df_list = []

for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

In [13]:
df= pd.concat(df_list)
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


# Baseline Model for yelp data

In [14]:
from sklearn.model_selection import train_test_split

df_yelp = df[df['source'] == 'yelp']

X = df_yelp['sentence'].values
Y = df_yelp['label'].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

# Shape the data in an acceptable shape by Regression Model


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(x_train)

transformed_x_train = vectorizer.transform(x_train)
transformed_x_test  = vectorizer.transform(x_test)
transformed_x_train

<750x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 7422 stored elements in Compressed Sparse Row format>

# Train/classify the data using Logistic Regression


In [18]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(transformed_x_train, y_train)

LogisticRegression()

In [None]:
score = classifier.score(transformed_x_test, y_test)
score

0.808

# Baseline Model for amazon data

In [19]:
df_yelp = df[df['source'] == 'amazon']

X = df_yelp['sentence'].values
Y = df_yelp['label'].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

transformed_x_train = vectorizer.transform(x_train)
transformed_x_test  = vectorizer.transform(x_test)
classifier = LogisticRegression()

classifier.fit(transformed_x_train, y_train)
score = classifier.score(transformed_x_test, y_test)
score


0.82

# Baseline Model for imdb data

In [20]:
df_yelp = df[df['source'] == 'imdb']

X = df_yelp['sentence'].values
Y = df_yelp['label'].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

transformed_x_train = vectorizer.transform(x_train)
transformed_x_test  = vectorizer.transform(x_test)
classifier = LogisticRegression()

classifier.fit(transformed_x_train, y_train)
score = classifier.score(transformed_x_test, y_test)
score

0.7379679144385026