# Text Classification with Naive Bayes

### Establisch basic modell for text classification that serves as benchmarking

## Load Data

In [70]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

df = []
df = pd.read_excel('sentences_with_sentiment.xlsx', sheet_name = 'Sheet1')

df.set_index('ID', inplace=True)

print(df.shape)
df.head(2)

(266, 4)


Unnamed: 0_level_0,Sentence,Positive,Negative,Neutral
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The results in 2nd line treatment show an ORR ...,1,0,0
2,The long duration of response and high durable...,1,0,0


### Add Labels

In [71]:
labels = []

for i in df.index:
    cont = 0
    if df.loc[i,'Positive'] == 1:
        labels.append('Positive')
        cont +=1
        
    if df.loc[i,'Negative'] == 1:
        labels.append('Negative')
        cont +=1
        
    if df.loc[i,'Neutral'] == 1:
        labels.append('Neutral')
        cont+=1
        
    if cont != 1:
        raise("Fehler Label")
        
df['labels'] = labels
df.drop(['Positive','Negative','Neutral'],axis=1,inplace=True)
df.head()

Unnamed: 0_level_0,Sentence,labels
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The results in 2nd line treatment show an ORR ...,Positive
2,The long duration of response and high durable...,Positive
3,The median OS time in the updated results exce...,Neutral
4,"Therefore, the clinical benefit in 2nd line tr...",Positive
5,"The data provided in 1st line, although prelim...",Positive


### Make Train/Test

In [72]:
msk = np.random.rand(len(df)) < 0.75
train = df[msk]
test = df[~msk]
train.shape, test.shape, train.head(2), test.head(2)

((204, 2),
 (62, 2),
                                              Sentence    labels
 ID                                                             
 1   The results in 2nd line treatment show an ORR ...  Positive
 2   The long duration of response and high durable...  Positive,
                                              Sentence    labels
 ID                                                             
 5   The data provided in 1st line, although prelim...  Positive
 9   The European Medicines Agency will review new ...   Neutral)

## Feature Engineering

### Vectorized Representation
### A lot can be done here, lemmantize, n-grams, etc.

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

sentences = df.Sentence.values

vectorizer = CountVectorizer(stop_words = 'english')
X_train_counts = vectorizer.fit_transform(train.Sentence)

# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print(X_train_tfidf.shape)

(204, 1053)


## Imbalance Data check solutions

### Train Classifier

In [75]:
clf = MultinomialNB().fit(X_train_tfidf, train.labels)

### Predict Test

In [101]:
X_test_counts = vectorizer.transform(test.Sentence)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
predicted = clf.predict(X_test_tfidf)
probabilities = clf.predict_proba(X_test_tfidf)

In [77]:
print('We got an accuracy of',np.mean(predicted == test.labels)*100, '% over the test data.')

We got an accuracy of 59.67741935483871 % over the test data.


### Confusion Matrix

In [78]:
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics

confusion_matrix = metrics.confusion_matrix(test.labels, predicted)
confusion_matrix

array([[ 0,  2,  6],
       [ 0,  0, 17],
       [ 0,  0, 37]], dtype=int64)

In [97]:
len(test.labels[test.labels=='Neutral'])

17

In [103]:
probabilities[0]

array([0.16547718, 0.13730685, 0.69721597])

In [98]:
output = pd.DataFrame(test.labels.values, columns =['Test'])
output['Pred'] = predicted
output.tail()

Unnamed: 0,Test,Pred
57,Neutral,Positive
58,Positive,Positive
59,Positive,Positive
60,Positive,Positive
61,Neutral,Positive
