## **Load Data**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re 
import nltk 


from sklearn import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report



In [3]:
# Read cleaned data
train=pd.read_csv("/content/drive/MyDrive/ProjectAI/Data/Cleaned/clean_train.csv")
test=pd.read_csv("/content/drive/MyDrive/ProjectAI/Data/Cleaned/clean_test.csv")

In [4]:
train.fillna('', inplace=True)
test.fillna('', inplace=True)

In [5]:
train

Unnamed: 0,labels,text
0,1,menyrbie phil_gahan chrisitv
1,2,advice talk neighbours family exchange phone n...
2,2,coronavirus australia woolworths give elderly ...
3,2,food stock one empty please dont panic enough ...
4,0,ready go supermarket covid outbreak im paranoi...
...,...,...
41152,1,airline pilots offering stock supermarket shel...
41153,0,response complaint provided citing covid relat...
41154,2,know itâs getting tough kameronwilds rationing...
41155,1,wrong smell hand sanitizer starting turn coron...


In [6]:
test

Unnamed: 0,labels,text
0,0,trending new yorkers encounter empty supermark...
1,2,couldnt find hand sanitizer fred meyer turned ...
2,2,find protect loved ones coronavirus
3,0,panic buying hits newyork city anxious shopper...
4,1,toiletpaper dunnypaper coronavirus coronavirus...
...,...,...
3793,2,meanwhile supermarket israel people dance sing...
3794,0,panic buy lot nonperishable items echo needs f...
3795,1,asst prof economics cconces nbcphiladelphia ta...
3796,0,gov need somethings instead biar je rakyat ass...


## **Extract Features**

In [None]:
train.text

0                          menyrbie phil_gahan chrisitv   
1        advice talk neighbours family exchange phone n...
2        coronavirus australia woolworths give elderly ...
3        food stock one empty please dont panic enough ...
4        ready go supermarket covid outbreak im paranoi...
                               ...                        
41152    airline pilots offering stock supermarket shel...
41153    response complaint provided citing covid relat...
41154    know itâs getting tough kameronwilds rationing...
41155    wrong smell hand sanitizer starting turn coron...
41156    tartiicat well newused rift going  amazon rn a...
Name: text, Length: 41157, dtype: object

In [7]:
# Extract features using bags of words 
from sklearn.feature_extraction.text import CountVectorizer

# vectorize train set
vectorizer = CountVectorizer()
message = vectorizer.fit_transform(train['text'])
message.shape

(41157, 59600)

In [8]:
X_train = message
y_train = train["labels"]

In [None]:
X_train

<41157x59600 sparse matrix of type '<class 'numpy.int64'>'
	with 709818 stored elements in Compressed Sparse Row format>

In [None]:
y_train

0        1
1        2
2        2
3        2
4        0
        ..
41152    1
41153    0
41154    2
41155    1
41156    0
Name: labels, Length: 41157, dtype: int64

In [9]:
#vectorize test set

message2 = vectorizer.transform(test.text)
message2.shape

(3798, 59600)

In [10]:
X_test= message2
y_test= test['labels']

##**Logistic Regression using sklearn library**




In [12]:
import time

In [17]:
model =LogisticRegression()
start= time.time()
classifier = model.fit(X_train,y_train)
end= time.time()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print(classifier.predict(X_train))
print(y_train.values)

[1 2 2 ... 2 2 0]
[1 2 2 ... 2 1 0]


In [15]:
# Result of the train set

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(y_train, pred))
print("Accuracy: \n", accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     15398
           1       0.98      0.95      0.96      7713
           2       0.97      0.98      0.98     18046

    accuracy                           0.97     41157
   macro avg       0.97      0.97      0.97     41157
weighted avg       0.97      0.97      0.97     41157


Confusion Matrix: 
 [[15031   113   254]
 [  193  7318   202]
 [  235    72 17739]]
Accuracy: 
 0.97402628957407


In [None]:
print(classifier.predict(X_test))
print(y_test.values)

[0 2 2 ... 1 0 2]
[0 2 2 ... 1 0 2]


In [18]:
#Accuracy of test set

pred = classifier.predict(X_test)
print(classification_report(y_test, pred))
print()
print("Accuracy: \n", accuracy_score(y_test, pred))
print('Time Train', end - start)

              precision    recall  f1-score   support

           0       0.82      0.79      0.81      1633
           1       0.67      0.68      0.68       619
           2       0.82      0.84      0.83      1546

    accuracy                           0.80      3798
   macro avg       0.77      0.77      0.77      3798
weighted avg       0.80      0.80      0.80      3798


Accuracy: 
 0.7956819378620327
Time Train 15.605563640594482


In [None]:
# Predict a random comment
dict1=[]
dict1.append("I love you")
message3= vectorizer.transform(dict1)
pred= classifier.predict(message3)

if (pred== [1]):
    print ("Neutral")
elif (pred== [0]):
    print ("Negative")
else:
    print ("Positive")

Positive


## **Implement Logistic Regression code**

In [19]:
from scipy.sparse import csr_matrix

In [20]:
from scipy.special import expit, logit


In [21]:
from sklearn.utils.extmath import safe_sparse_dot

In [22]:
import numpy as np
import matplotlib.pyplot as plt

class MultiClassLogisticRegression:
    
    def __init__(self, n_iter = 100000, thres=1e-4):
        self.n_iter = n_iter
        self.thres = thres
    
    def fit(self, X, y, batch_size=64, lr=0.001, rand_seed=4, verbose=False): 
        np.random.seed(rand_seed) 
        self.classes = np.unique(y)
        self.class_labels = {c:i for i,c in enumerate(self.classes)}
        #X = self.add_bias(X)   #  41157, 59601
        y = self.one_hot(y)   # 41157,3
        #self.loss = []
        self.weights = np.zeros(shape=(len(self.classes),X.shape[1]))  # 5, 59601
        self.fit_data(X, y, batch_size, lr, verbose)
        return self
 
    def fit_data(self, X, y, batch_size, lr, verbose):
        i = 0
        while (not self.n_iter or i < self.n_iter):
            #self.loss.append(self.cross_entropy(y, self.predict_(X)))
            idx = np.random.choice(X.shape[0], batch_size) 
            X_batch, y_batch = X[idx], y[idx] 
 
            error = y_batch - self.predict_(X_batch)   # batch, 5

            # Update weights 
            update = (lr * safe_sparse_dot(error.T, X_batch))  # 5,
            self.weights += update  
            
            if np.abs(update).max() < self.thres: break
            if i % 1000 == 0 and verbose: 
                print(' Training Accuray at {} iterations is {}'.format(i, self.evaluate_(X, y)))
            i +=1
    
    def predict(self, X):
        return self.predict_(self.add_bias(X))
    
    def predict_(self, X):
        pre_vals = safe_sparse_dot(X, self.weights.T).reshape(-1,len(self.classes))
        return self.softmax(pre_vals)
    
    def sigmoid(self, z):
        return 1/(1+np.exp(-z))

    def softmax(self, z):
        return np.exp(z) / np.sum(np.exp(z), axis=1).reshape(-1,1)
    
    def predict_classes(self, X):
        self.probs_ = self.predict_(X)
        return np.vectorize(lambda c: self.classes[c])(np.argmax(self.probs_, axis=1))
    
    def add_bias(self,X):
        return np.insert(X, 0, 1, axis=1)
    
    def get_random_weights(self, row, col):
        return np.zeros(shape=(row,col))
    
    def one_hot(self, y):
        return np.eye(len(self.classes))[np.vectorize(lambda c: self.class_labels[c])(y).reshape(-1)]
    
    def score(self, X, y):
        return np.mean(self.predict_classes(X) == y)
    
    def evaluate_(self, X, y):
        return np.mean(np.argmax(self.predict_(X), axis=1) == np.argmax(y, axis=1))
    
    def cross_entropy(self, y, probs):
        return -1 * np.mean(y * np.log(probs))

In [23]:
classifier1 = MultiClassLogisticRegression()
start= time.time()
model1 = classifier1.fit(X_train,y_train)
end= time.time()



In [None]:
# Predict 
y_pred1= classifier1.predict_classes(X_train)

In [None]:
y_pred2 =classifier1.predict_classes(X_test)

In [None]:
# Accuracy of train set
print(classification_report(y_train, y_pred1))
print()
print("Confusion Matrix: \n", confusion_matrix(y_train, y_pred1))
print("Accuracy: \n", accuracy_score(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92     15398
           1       0.91      0.85      0.88      7713
           2       0.93      0.94      0.93     18046

    accuracy                           0.92     41157
   macro avg       0.92      0.91      0.91     41157
weighted avg       0.92      0.92      0.92     41157


Confusion Matrix: 
 [[14209   370   819]
 [  583  6576   554]
 [  736   290 17020]]
Accuracy: 
 0.918555774230386


In [None]:
# Accuracy of test set
print(classification_report(y_test, y_pred2))
print()
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred2))
print("Accuracy: \n", accuracy_score(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80      1633
           1       0.68      0.65      0.66       619
           2       0.81      0.84      0.83      1546

    accuracy                           0.79      3798
   macro avg       0.77      0.76      0.76      3798
weighted avg       0.79      0.79      0.79      3798


Confusion Matrix: 
 [[1295  119  219]
 [ 138  402   79]
 [ 179   70 1297]]
Accuracy: 
 0.7883096366508688
