In [35]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [9]:
# Location of each labelled-sentence file, keyed by the dataset name.
filepath_dict = {
    'yelp': '../data/yelp_labelled.txt',
    'amazon': '../data/amazon_cells_labelled.txt',
    'imdb': '../data/imdb_labelled.txt',
}

In [11]:
# Read every tab-separated file into its own frame, tag each row with the
# dataset name it came from, then stack the per-dataset frames into one.
df_list = [
    pd.read_csv(filepath, names=['sentence', 'label'], sep='\t').assign(source=source)
    for source, filepath in filepath_dict.items()
]

df = pd.concat(df_list)

In [12]:
# Display the combined frame holding the sentences from all three sources.
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


### Extract the text features

In [13]:
# Toy corpus: two short sentences used to illustrate the bag-of-words encoding.
# NOTE(review): 'Join' looks like a typo for 'John'; it is kept as-is because
# the recorded vocabulary output contains both tokens.
sentences = [
    'Join likes ice cream',
    'John hates chocolate',
]

In [15]:
# Build a bag-of-words vocabulary from the toy sentences, then show the
# token -> column-index mapping and the resulting count matrix.
# `min_df` is left at its default of 1: the integer 0 is rejected by the
# parameter validation in recent scikit-learn releases, and with either value
# no token is filtered out, so the vocabulary and counts are unchanged.
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(sentences)
print(vectorizer.vocabulary_)
print(vectorizer.transform(sentences).toarray())

{'join': 5, 'likes': 6, 'ice': 3, 'cream': 1, 'john': 4, 'hates': 2, 'chocolate': 0}
[[0 1 0 1 0 1 1]
 [1 0 1 0 1 0 0]]


In [18]:
# Keep only the Yelp rows and hold out a quarter of them for testing.
# The fixed random_state makes the split repeatable across runs.
df_yelp = df.loc[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

In [20]:
# Learn the vocabulary from the training sentences only, then encode both
# splits against that same vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train, X_test = (
    vectorizer.transform(sentences_train),
    vectorizer.transform(sentences_test),
)

In [22]:
# Dimensions of the encoded training matrix.
X_train.shape

(750, 1714)

In [23]:
# Dimensions of the encoded test matrix (same column count as the training matrix).
X_test.shape

(250, 1714)

In [34]:
# Dense view of the first encoded training row.
X_train[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

First, try logistic regression as a baseline model.

In [36]:
# Baseline: logistic regression with default settings, fitted on the training
# matrix and scored on the held-out test matrix.
classifier = LogisticRegression()
classifier.fit(X=X_train, y=y_train)
score = classifier.score(X=X_test, y=y_test)
score

0.796

Compare three different datasets

In [44]:
class Dataprocess():
    """Select the rows of one source from a combined frame and split them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the 'sentence', 'label' and 'source' columns.
    source : str
        Value of the 'source' column whose rows are kept.

    Attributes
    ----------
    df : pandas.DataFrame
        Rows of ``data`` whose 'source' equals ``source``.
    sentences : numpy.ndarray
        Values of the 'sentence' column of ``df``.
    y : numpy.ndarray
        Values of the 'label' column of ``df``.
    """

    def __init__(self, data, source):
        self.df = data[data['source'] == source]
        self.sentences = self.df['sentence'].values
        self.y = self.df['label'].values

    def train_test_data_split(self, test_size=0.25, random_state=1000):
        """Split the selected sentences and labels into train and test parts.

        Parameters
        ----------
        test_size : float or int, default 0.25
            Forwarded to ``train_test_split``; the default keeps the
            previously hard-coded quarter hold-out.
        random_state : int, default 1000
            Forwarded to ``train_test_split`` so the split is repeatable;
            the default keeps the previously hard-coded seed.

        Returns
        -------
        tuple
            ``(sentences_train, sentences_test, y_train, y_test)``.
        """
        sentences_train, sentences_test, y_train, y_test = train_test_split(
            self.sentences,
            self.y,
            test_size=test_size,
            random_state=random_state,
        )
        return sentences_train, sentences_test, y_train, y_test

In [47]:
class CountTheVectorizer():
    """Thin wrapper that owns a ``CountVectorizer`` and encodes two splits with it."""

    def __init__(self):
        self.vectorizer = CountVectorizer()

    def fit_ve(self, sentences_train, sentences_test):
        """Fit on the training sentences, then encode both splits.

        The vocabulary is learned from ``sentences_train`` only; the test
        sentences are transformed with that same fitted vectorizer.

        Returns
        -------
        tuple
            ``(X_train, X_test)`` as returned by the vectorizer's ``transform``.
        """
        bow = self.vectorizer
        bow.fit(sentences_train)
        return bow.transform(sentences_train), bow.transform(sentences_test)

In [50]:
class Classfier():
    """Thin wrapper that owns a default ``LogisticRegression`` model."""

    def __init__(self):
        self.classfier = LogisticRegression()

    def fit_model(self, X_train, y_train, X_test, y_test):
        """Fit the model on the training data and return its test score.

        Returns
        -------
        The value of the model's ``score(X_test, y_test)`` after fitting on
        ``(X_train, y_train)``.
        """
        model = self.classfier
        model.fit(X_train, y_train)
        return model.score(X_test, y_test)

In [52]:
def main():
    """Train and score one model per source found in the module-level ``df``.

    For every distinct value of ``df['source']`` the rows are split,
    vectorized and fed to a fresh classifier, and the resulting score is
    printed on its own line.
    """
    report = 'Accuracy for {} data: {}'
    for source in df['source'].unique():
        splits = Dataprocess(df, source).train_test_data_split()
        sentences_train, sentences_test, y_train, y_test = splits

        X_train, X_test = CountTheVectorizer().fit_ve(sentences_train, sentences_test)

        score = Classfier().fit_model(X_train, y_train, X_test, y_test)
        print(report.format(source, score))


if __name__ == '__main__':
    main()

Accuracy for yelp data: 0.796
Accuracy for amazon data: 0.796
Accuracy for imdb data: 0.7486631016042781
