In [70]:
import pandas as pd
import numpy as np

In [206]:
# Both files are read as header-less, tab-separated tables whose two columns
# are named 'annotation' and 'review' (in that order).
tsv_options = dict(sep='\t', names=['annotation', 'review'], header=None)
train = pd.read_csv('PA3_train.tsv', **tsv_options)
test = pd.read_csv('PA3_test_clean.tsv', **tsv_options)

# Training Data Cleaning

In [207]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,annotation,review
0,0/0,Ordered my food the hole meal looked dead. pla...
1,1/1,We stopped her whilst walking in the Haga area...
2,0/0,"Bad experience, On 23/03/19 Myself and my part..."
3,0/0,Extremely underwhelming experience here last n...
4,0/0,Waited 30 minutes to get a table…that was ok. ...


In [208]:
# (rows, columns) of the training frame.
train.shape

(7018, 2)

In [209]:
# Show the full text of the review stored under index label 1.
train.loc[1, 'review']

'We stopped her whilst walking in the Haga area. The Cafe is well recommended. Good service and we enjoyed our teas and a Cinamon Roll. The latter was large but so good that between us we finished it! Recommended stop off.'

In [210]:
# Split each raw label string on '/' and store the two parts as separate columns.
annotation_parts = train['annotation'].str.split('/', expand=True)
train[['annotation1', 'annotation2']] = annotation_parts

In [211]:
# Preview again to check the annotation1 / annotation2 columns.
train.head()

Unnamed: 0,annotation,review,annotation1,annotation2
0,0/0,Ordered my food the hole meal looked dead. pla...,0,0
1,1/1,We stopped her whilst walking in the Haga area...,1,1
2,0/0,"Bad experience, On 23/03/19 Myself and my part...",0,0
3,0/0,Extremely underwhelming experience here last n...,0,0
4,0/0,Waited 30 minutes to get a table…that was ok. ...,0,0


In [212]:
# Distinct raw label strings, to spot malformed values.
train['annotation'].unique()

array(['0/0', '1/1', '1/0', '-1/0', '-1/1', '0/1', '2/1', '2/0', '1/',
       '9/1'], dtype=object)

In [213]:
# Distinct values of the first label part.
train['annotation1'].unique()

array(['0', '1', '-1', '2', '9'], dtype=object)

In [214]:
# Distinct values of the second label part (an empty string means it was missing).
train['annotation2'].unique()

array(['0', '1', ''], dtype=object)

In [215]:
# Mark empty second-part labels (e.g. from a raw value such as '1/') as missing.
# Only the annotation2 column is modified; the previous whole-row assignment
# also ran the blank-string regex over every other column of the matching rows,
# including the review text. Re-running this cell leaves the frame unchanged.
train['annotation2'] = train['annotation2'].mask(train['annotation2'] == '')

In [216]:
# Preview after the blank-to-NaN replacement.
train.head()

Unnamed: 0,annotation,review,annotation1,annotation2
0,0/0,Ordered my food the hole meal looked dead. pla...,0,0
1,1/1,We stopped her whilst walking in the Haga area...,1,1
2,0/0,"Bad experience, On 23/03/19 Myself and my part...",0,0
3,0/0,Extremely underwhelming experience here last n...,0,0
4,0/0,Waited 30 minutes to get a table…that was ok. ...,0,0


In [217]:
# Distinct values of the second label part at this stage of the cleaning.
train['annotation2'].unique()

array(['0', '1', nan], dtype=object)

In [218]:
# Summary (non-null count, number of distinct values, most frequent value).
train['annotation2'].describe()

count     7017
unique       2
top          1
freq      3644
Name: annotation2, dtype: object

In [219]:
# Number of rows whose second label part is '1'.
train.loc[train['annotation2'] == '1', 'annotation2'].count()

3644

In [220]:
# Number of rows whose second label part is '0'.
train.loc[train['annotation2'] == '0', 'annotation2'].count()

3373

In [221]:
# Where the second label part is missing, fall back to the first part.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on `train['annotation2']`: that form is a chained
# in-place call, which pandas 2.x warns about and which does not update
# `train` at all once Copy-on-Write is enabled.
train['annotation2'] = train['annotation2'].fillna(train['annotation1'])

In [222]:
# Distinct values of the second label part after filling missing entries.
train['annotation2'].unique()

array(['0', '1'], dtype=object)

In [223]:
# Summary of the second label part after filling missing entries.
train['annotation2'].describe()

count     7018
unique       2
top          1
freq      3645
Name: annotation2, dtype: object

In [224]:
# Number of rows labelled '1' after filling missing entries.
train.loc[train['annotation2'] == '1', 'annotation2'].count()

3645

In [225]:
# Number of rows labelled '0' after filling missing entries.
train.loc[train['annotation2'] == '0', 'annotation2'].count()

3373

In [237]:
# Build the modelling frame: keep the review text and the second label part,
# expose that label under the name 'annotation', and convert it from string
# to a numeric dtype.
train_df = (
    train[['review', 'annotation2']]
    .rename(columns={"annotation2": "annotation"})
    .assign(annotation=lambda frame: pd.to_numeric(frame['annotation']))
)

In [238]:
# Preview the cleaned training frame (review text plus numeric label).
train_df.head()

Unnamed: 0,review,annotation
0,Ordered my food the hole meal looked dead. pla...,0
1,We stopped her whilst walking in the Haga area...,1
2,"Bad experience, On 23/03/19 Myself and my part...",0
3,Extremely underwhelming experience here last n...,0
4,Waited 30 minutes to get a table…that was ok. ...,0


In [239]:
# Check column dtypes and non-null counts of the cleaned training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7018 entries, 0 to 7017
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   review      7018 non-null   object
 1   annotation  7018 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 109.8+ KB


# Test data cleaning

In [241]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,annotation,review
0,0,Over all I felt a bit disappointing with abov...
1,1,A wonderful experience!
2,1,Always very delicious dishes and attentive ser...
3,1,Amazing as always
4,1,"Amazing food, the aubergine mess and the Tunis..."


In [242]:
# (rows, columns) of the test frame.
test.shape

(1751, 2)

In [246]:
# Distinct label values present in the test frame.
test['annotation'].unique()

array([0, 1], dtype=int64)

In [247]:
# Number of test rows labelled 1.
test.loc[test['annotation'] == 1, 'annotation'].count()

886

In [248]:
# Number of test rows labelled 0.
test.loc[test['annotation'] == 0, 'annotation'].count()

865

# TF-IDF vectorization

In [269]:
# the actual classification algorithm
from sklearn.svm import LinearSVC

# for converting training and test datasets into matrices
# TfidfVectorizer does this specifically for documents
from sklearn.feature_extraction.text import TfidfVectorizer

# for bundling the vectorizer and the classifier as a single "package"
from sklearn.pipeline import make_pipeline

# for splitting the dataset into training and test sets 
from sklearn.model_selection import train_test_split

# for evaluating the quality of the classifier
from sklearn.metrics import accuracy_score

In [270]:
# Learn the vocabulary and IDF weights from the training reviews only, then
# apply that same fitted vectorizer to the test reviews. Calling fit_transform
# on the test set would refit the vectorizer, so X_test would have a different
# vocabulary (different columns) from X_train and a model trained on X_train
# could not be applied to it; it would also leak test-set statistics into the
# features.
tfidfvec = TfidfVectorizer()
X_train = tfidfvec.fit_transform(train_df.review)
X_test = tfidfvec.transform(test.review)

# ML models