In [8]:
import pandas as pd

In [9]:
filepath_dict = {'yelp':   'data/sentiment_analysis/yelp_labelled.txt',
                 'amazon': 'data/sentiment_analysis/amazon_cells_labelled.txt',
                 'imdb':   'data/sentiment_analysis/imdb_labelled.txt'}

In [10]:

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [11]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
vectorizer = CountVectorizer(min_df=0, lowercase=False)

In [14]:
vectorizer.fit(sentences)

CountVectorizer(lowercase=False, min_df=0)

In [15]:
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [16]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
df_yelp = df[df['source']=='yelp']

In [19]:
sentences = df_yelp['sentence'].values

In [20]:
y = df_yelp['label'].values

In [22]:
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
vectorizer = CountVectorizer()

In [25]:
vectorizer.fit(sentences_train)

CountVectorizer()

In [26]:
X_train = vectorizer.transform(sentences_train)

In [27]:
X_test  = vectorizer.transform(sentences_test)

In [28]:
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
classifier = LogisticRegression()

In [31]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [32]:
score = classifier.score(X_test, y_test)

In [33]:
print("Accuracy:", score)

Accuracy: 0.796


In [34]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
