# Text Classification with Naive Bayes

### Create a benchmark model

## Load Data

In [100]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import ComplementNB
  
# Load the labelled sentences and key them by the 'ID' column.
# The frame is built in one expression (no placeholder assignment, no in-place
# mutation), so `df` is either a fully loaded DataFrame or not defined at all.
df = pd.read_excel('sentences_with_sentiment.xlsx', sheet_name='Sheet1').set_index('ID')

print(df.shape)
df.head(2)

(266, 4)


Unnamed: 0_level_0,Sentence,Positive,Negative,Neutral
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The results in 2nd line treatment show an ORR ...,1,0,0
2,The long duration of response and high durable...,1,0,0


### Add Labels

In [101]:
# Collapse the one-hot sentiment columns into a single categorical `labels` column.
label_columns = ['Positive', 'Negative', 'Neutral']

# True where a sentiment flag is set for a sentence.
is_flagged = df[label_columns].eq(1)

# Every sentence must carry exactly one sentiment flag; zero or several flags would
# make the label ambiguous and misalign labels with sentences.
flags_per_row = is_flagged.sum(axis=1)
invalid_ids = flags_per_row.index[flags_per_row != 1].tolist()
if invalid_ids:
    # Raise a real exception type: raising a bare string is itself a TypeError in Python 3.
    raise ValueError(f"Fehler Label: {invalid_ids}")

# With exactly one True per row, idxmax returns the name of the flagged column.
df['labels'] = is_flagged.idxmax(axis=1)
df = df.drop(columns=label_columns)

# Sanity check: shuffle the labels to confirm the classifier drops to chance level.
#df['labels'] = np.random.permutation(df['labels'].values)

df.head()

Unnamed: 0_level_0,Sentence,labels
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The results in 2nd line treatment show an ORR ...,Positive
2,The long duration of response and high durable...,Positive
3,The median OS time in the updated results exce...,Neutral
4,"Therefore, the clinical benefit in 2nd line tr...",Positive
5,"The data provided in 1st line, although prelim...",Positive


### Make Train/Test

In [102]:
# Hold out 15 % of the sentences for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Sentence'],
    df['labels'],
    test_size=0.15,
    random_state=42,
)

# Quick look at the split sizes and the first labels of each partition.
X_train.shape, X_test.shape, y_train.head(2), y_test.head(2)

((226,), (40,), ID
 16     Positive
 194     Neutral
 Name: labels, dtype: object, ID
 182    Positive
 120     Neutral
 Name: labels, dtype: object)

## Feature Engineering

### Vectorized Representation - a lot more can be done here: lemmatization, n-grams, etc.

In [103]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Two-stage text featurisation: raw token counts (English stop words removed),
# then re-weighting of those counts into a normalized tf-idf representation.
vectorizer = CountVectorizer(stop_words='english')
tfidf_transformer = TfidfTransformer()

# Both stages are fitted on the training sentences only.
X_train_counts = vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print(X_train_tfidf.shape)

(226, 1100)


### Train Classifier
#### Compare "normal" Multinomial Naive Bayes vs. Complement Naive Bayes (works better with imbalanced data)

In [104]:
# Alternative baseline for comparison:
#clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Complement Naive Bayes, fitted on the tf-idf features of the training sentences.
clf = ComplementNB()
clf.fit(X_train_tfidf, y_train);  # trailing ';' keeps the cell output empty

### Predict Test

In [105]:
# Push the test sentences through the vectorizer and tf-idf transformer that were
# fitted on the training data (transform only, no re-fitting).
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
# Hard class predictions for every test sentence.
predicted = clf.predict(X_test_tfidf)
# Per-class probabilities for every test sentence (one column per class).
probabilities = clf.predict_proba(X_test_tfidf)

### Evaluating Classifier
#### Cross-validation can also be implemented for evaluation

#### In problems with strong class imbalance, accuracy can be misleading: a model achieves high accuracy simply by predicting the majority class for every sample.

#### Precision is the number of True Positives divided by the sum of True Positives and False Positives. A low precision indicates a large number of False Positives.

#### Recall is the number of True Positives divided by the sum of True Positives and False Negatives. Recall can be thought of as a measure of a classifier's completeness. A low recall indicates many False Negatives.

#### F1 score conveys the balance between the precision and the recall.

In [106]:
# Evaluate the classifier on the training set and on the held-out test set.
# Predictions are computed here so the cell depends only on the fitted classifier
# and the feature matrices.
prediction = clf.predict(X_test_tfidf)
prediction_train = clf.predict(X_train_tfidf)

# `confusion_matrix` is the function imported from sklearn.metrics at the top of the
# notebook. The result is stored under a different name so the import is not
# shadowed and the cell can be re-run.
conf_matrix = confusion_matrix(y_test, prediction)

print(f"Training Set Accuracy : {accuracy_score(y_train, prediction_train) * 100} %\n")
print(f"Test Set Accuracy : {accuracy_score(y_test, prediction) * 100} % \n\n")
print(f"Classifier Report : \n\n {classification_report(y_test, prediction)}")
print(f"Confussion Matrix: \n {conf_matrix}")

Training Set Accuracy : 96.90265486725663 %

Test Set Accuracy : 65.0 % 


Classifier Report : 

               precision    recall  f1-score   support

    Negative       0.67      0.57      0.62         7
     Neutral       0.50      0.27      0.35        11
    Positive       0.68      0.86      0.76        22

   micro avg       0.65      0.65      0.65        40
   macro avg       0.62      0.57      0.58        40
weighted avg       0.63      0.65      0.62        40

Confussion Matrix: 
 [[ 4  0  3]
 [ 2  3  6]
 [ 0  3 19]]


In [120]:
# Spot-check one test sample: its per-class probabilities next to the predicted label.
i = 4
probabilities[i], prediction[i]

(array([0.21049272, 0.20269852, 0.58680876]), 'Positive')

In [121]:
# Class labels known to the fitted classifier; per scikit-learn's documentation the
# columns of predict_proba follow this order.
clf.classes_

array(['Negative', 'Neutral', 'Positive'], dtype='<U8')