# Foundation of machine learning project
1. Write a python class to implement Gaussian Discriminant Analysis(GDA)
and another class to implement Naive Bayes (NB) algorithm from scratch. Test
your models with the dataset.
2. Using the dataset given, compare Gaussian Discriminant Analysis and Naive
Bayes with Logistic regression using a different size of the dataset. Use the
following sizes of the data for comparison; 10% of the data, 30% of the data, 60% of the data, 100% of the data

For each of the sizes, write your observation and show the reports of the comparison on the algorithms.

In [45]:
import pandas as pd
import numpy as np
import matplotlib as plt
import math as m
import scipy
from scipy import stats
import sklearn
from sklearn.model_selection import train_test_split , learning_curve
from textblob import TextBlob
from sklearn.preprocessing import OneHotEncoder

In [46]:
class GDAnalysis(object):
    
    def __init__(self,theta = None, mu = None, cov = None,prediction = None):
        self.theat = theta
        self.mu = mu
        self.cov = cov
        self.prediction = prediction
        
    def prior_probability(self,target):
        theta = {}
        for c in target.unique():
            m = (target==c).mean()
            theta[c]=m
        return theta

    def estimated_means(self,X,target):
        mean = {}
        Data = X.copy()
        feat = X.columns
        for c in target.unique():
            dc = Data[target==c]
            mc = dc.mean(axis =0)
            mean[c] = mc.values
        return mean

    def cov_matrix(self,X,target):
        Data = X.copy()
        #Data['targ'] = target
        feat = X.columns
        m = self.estimated_means(X,target)
        for v in target.unique():
            Data[target==v] = Data[target==v] - m[v]
        cov = np.dot(Data.T,Data)/len(X)
        self.inv_cov = np.linalg.inv(cov)
        return cov
    
    def gauss_density(self,x):
        """pdf of the multivariate normal distribution."""
        x_m = x - np.array(list(self.mu.values()))
        return np.exp(-1 * np.sum(x_m.dot(self.inv_cov) * x_m, axis=1)) * 0.5 * np.array(list(self.theta.values()))
    
    def fit(self,X,target):
        self.theta = self.prior_probability(target) 
        self.mu = self.estimated_means(X,target)
        self.cov = self.cov_matrix(X,target)
        return self.theta , self.mu , self.cov
    def predict(self,X):
        self.prediction = np.apply_along_axis(self.get_prediction, 1, X)
        self.y_pred =  np.argmax(self.prediction, axis=1)
        return self.y_pred
    
    def get_prediction(self, x): return self.gauss_density(x)
    
    def accuracy(self, y_test):
       
        return (y_test==self.y_pred).mean()

In [47]:
class MCLogisticReg(object):
    def __ini__(self, max_iter=5000, lr=0.05):
        self.sel_max = sel_max
        self.lr = lr
        
    def sigmoid(self,z):
        return 1 / (1 + np.exp(-z))
    
    def cost_function(self,theta, x, y):
        self.cost = 1 / len(y) * np.sum(-y * np.log(self.sigmoid(x.dot(theta))) - (1 - y) * np.log(1 - self.sigmoid(x.dot(theta))))
        self.grad = 1 / len(y) * ((y - self.sigmoid(x.dot(theta))).dot(x))
        return self.cost, self.grad
    def fit_array(self,x, y, max_iter=5000, alpha=0.1):
        x = np.insert(x, 0, 1, axis=1)
        self.thetas = []
        self.classes = np.unique(y)
        self.costs = np.zeros(max_iter)

        for c in self.classes:
            # one vs. rest binary classification
            binary_y = np.where(y == c, 1, 0)

            theta = np.zeros(x.shape[1])
            for epoch in range(max_iter):
                self.costs[epoch], grad = self.cost_function(theta, x, binary_y)
                theta += alpha * grad

            self.thetas.append(theta)
        return self.thetas, self.classes, self.costs
    def fit(self,x,y):
        if isinstance(x, pd.DataFrame):
            x =x.values
            y =y.values
            return self.fit_array(x, y)
        else:
            return self.fit_array(x,y)
    def predict(self,x):
        if isinstance(x, pd.DataFrame):
            x =x.values
            x = np.insert(x, 0, 1, axis=1)
            preds = [np.argmax([self.sigmoid(xi.dot(theta)) for theta in self.thetas]) for xi in x]
            self.y_pred =  [self.classes[p] for p in preds]
            return self.y_pred
        else:
            x = np.insert(x, 0, 1, axis=1)
            preds = [np.argmax([self.sigmoid(xi.dot(theta)) for theta in self.thetas]) for xi in x]
            self.y_pred =  [self.classes[p] for p in preds]
            return self.y_pred
    def accuracy(self,y):
        return (self.y_pred == y).mean()

In [48]:
class NaiveBayes_multiN(object):
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit_array(self, X, y):
        count_sample = X.shape[0]
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.class_log_prior_ = [np.log(len(i) / count_sample) for i in separated]
        count = np.array([np.array(i).sum(axis=0) for i in separated]) + self.alpha
        self.feature_log_prob_ = np.log(count / count.sum(axis=1)[np.newaxis].T)
        return self
    def fit (self,x, y):
        if isinstance(x, pd.DataFrame):
            x =x.values
            y =y.values
            return self.fit_array(x, y)
        else:
            return self.fit_array(x, y)
    def predict_log_proba(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
            return [(self.feature_log_prob_ * x).sum(axis=1) + self.class_log_prior_
                for x in X]
        else:
            return [(self.feature_log_prob_ * x).sum(axis=1) + self.class_log_prior_
                for x in X]
            

    def predict(self, X):
        self.y_pred = np.argmax(self.predict_log_proba(X), axis=1)
        return self.y_pred
    def accuracy(self,y):
        return (self.y_pred == y).mean()

In [49]:
GDA = GDAnalysis()

In [50]:
mcl = MCLogisticReg()

In [51]:
NB = NaiveBayes_multiN()

# Test for the GDA and the Naive Bayes with the data
In this test we combine the datasets, train both of the models with 70% of the data and test them with the remaining 30%

In [52]:
train = pd.read_csv('Data/drugLibTrain_raw.tsv',sep = '\t')
test = pd.read_csv('Data/drugLibTest_raw.tsv',sep = '\t')
df = pd.concat([train,test])
df = df.dropna()


In [53]:
df.head()

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


For some features like the sideEffects and effectiveness, we are going to numerically encode them. Since each of those two features have only 5 labels then we create a function to perform OnehotEncoding on them.

In [54]:

def onehotenc(data,features):
    for feature in features:
        for l in data[feature].unique():
            data[l] = (data[feature]==l).astype(int)
    return data
df= onehotenc(df,['sideEffects','effectiveness'])
Data = df.drop(['sideEffects','effectiveness','Unnamed: 0', 'urlDrugName'],axis = 1)

In [55]:
print(Data.shape)

(4132, 15)


In [56]:
Data.head()

Unnamed: 0,rating,condition,benefitsReview,sideEffectsReview,commentsReview,Mild Side Effects,Severe Side Effects,No Side Effects,Extremely Severe Side Effects,Moderate Side Effects,Highly Effective,Marginally Effective,Ineffective,Considerably Effective,Moderately Effective
0,4,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ...",1,0,0,0,0,1,0,0,0,0
1,1,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest...",0,1,0,0,0,1,0,0,0,0
2,10,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...,0,0,1,0,0,1,0,0,0,0
3,3,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...,1,0,0,0,0,0,1,0,0,0
4,2,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above,0,1,0,0,0,0,1,0,0,0


## Gaussian Discriminant Analysis
For the Gaussian Discriminant Analysis we are going to do some sentimental analysis in order to get get numerical information from the text feature. Here we use the library textblob that element give a score of how positive or negative is each text.

In [57]:
data = Data.copy()
for feat in [var for var in data.columns if data[var].dtypes=='O']:
    data[feat] = data[feat].apply(lambda x: TextBlob(x).sentiment[0])

In [58]:
data.head()

Unnamed: 0,rating,condition,benefitsReview,sideEffectsReview,commentsReview,Mild Side Effects,Severe Side Effects,No Side Effects,Extremely Severe Side Effects,Moderate Side Effects,Highly Effective,Marginally Effective,Ineffective,Considerably Effective,Moderately Effective
0,4,-0.316667,-0.147222,-0.316667,0.0,1,0,0,0,0,1,0,0,0,0
1,1,0.0,0.55,0.029545,-0.8,0,1,0,0,0,1,0,0,0,0
2,10,0.0,-0.078571,0.15,-0.040741,0,0,1,0,0,1,0,0,0,0
3,3,0.0,-0.091667,0.022222,0.0,1,0,0,0,0,0,1,0,0,0
4,2,0.0,0.0,-0.125,0.0,0,1,0,0,0,0,1,0,0,0


In [59]:
Y_train = data['rating']
X_train = data.drop('rating',axis =1)
x_train, x_test , y_train , y_test = train_test_split(X_train,Y_train,test_size = 0.3, random_state = 11)

In [60]:
GDA.fit(x_train,y_train)

({5: 0.05186721991701245,
  1: 0.1016597510373444,
  10: 0.23029045643153526,
  4: 0.03423236514522822,
  3: 0.04495159059474412,
  6: 0.05359612724757953,
  9: 0.15179806362378975,
  7: 0.11825726141078838,
  2: 0.03561549100968188,
  8: 0.177731673582296},
 {5: array([ 0.00611111,  0.06956376, -0.00222536,  0.05378849,  0.20666667,
          0.26666667,  0.14666667,  0.00666667,  0.37333333,  0.18      ,
          0.16      ,  0.04      ,  0.24      ,  0.38      ]),
  1: array([ 0.01187075,  0.03034855, -0.04104617,  0.02549438,  0.0170068 ,
          0.31972789,  0.06802721,  0.50340136,  0.09183673,  0.10204082,
          0.14285714,  0.53061224,  0.09863946,  0.12585034]),
  10: array([0.00355993, 0.11843238, 0.05044717, 0.07765548, 0.34384384,
         0.01351351, 0.6036036 , 0.0015015 , 0.03753754, 0.92192192,
         0.0015015 , 0.0015015 , 0.07057057, 0.0045045 ]),
  4: array([ 0.00299404,  0.09806295, -0.03842745,  0.04609647,  0.16161616,
          0.32323232,  0.09090909, 

In [61]:
GDA.predict(x_test)

array([9, 9, 9, ..., 7, 2, 2])

In [62]:
GDA.accuracy(y_test)

0.1693548387096774

We have noticed that the does not perform well with the data because the accuracy 17% is too small for such a classification problem. We must focus on the feature ingeneering and selection in order to get better accuracy.

## Naive Bayes
For the naive we are going ingeneer the text data differently as what we did for the gaussian discriminant analysis. Since the Naive Bayes model work with discret variables, we gonna use countVectorizer to select some words in the order text data as new feature and their values will be the number of times the words appear in each observation row. Before that some data cleaning based on text preprocessing has been done. we used methods from the *nltk* library to preprocess the text data.

In [63]:
df = Data.copy()
df['my_doc'] = df['condition'] + df['benefitsReview']+ df['sideEffectsReview'] + df['commentsReview']
df = df.drop(['condition','benefitsReview','sideEffectsReview','commentsReview'] , axis =True)
df.head()

Unnamed: 0,rating,Mild Side Effects,Severe Side Effects,No Side Effects,Extremely Severe Side Effects,Moderate Side Effects,Highly Effective,Marginally Effective,Ineffective,Considerably Effective,Moderately Effective,my_doc
0,4,1,0,0,0,0,1,0,0,0,0,management of congestive heart failureslowed t...
1,1,0,1,0,0,0,1,0,0,0,0,birth preventionAlthough this type of birth co...
2,10,0,0,1,0,0,1,0,0,0,0,menstrual crampsI was used to having cramps so...
3,3,1,0,0,0,0,0,1,0,0,0,acid refluxThe acid reflux went away for a few...
4,2,0,1,0,0,0,0,1,0,0,0,fibromyalgiaI think that the Lyrica was starti...


In [64]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
stemmer = SnowballStemmer("english")
import string
from nltk import pos_tag
from nltk.tokenize import WhitespaceTokenizer

In [65]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [66]:
def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

In [67]:
df['clean_doc'] = df['my_doc'].apply(lambda x: clean_text(x))

In [68]:
df.head()

Unnamed: 0,rating,Mild Side Effects,Severe Side Effects,No Side Effects,Extremely Severe Side Effects,Moderate Side Effects,Highly Effective,Marginally Effective,Ineffective,Considerably Effective,Moderately Effective,my_doc,clean_doc
0,4,1,0,0,0,0,1,0,0,0,0,management of congestive heart failureslowed t...,management congestive heart failureslowed prog...
1,1,0,1,0,0,0,1,0,0,0,0,birth preventionAlthough this type of birth co...,birth preventionalthough type birth control co...
2,10,0,0,1,0,0,1,0,0,0,0,menstrual crampsI was used to having cramps so...,menstrual crampsi use cramp badly would leave ...
3,3,1,0,0,0,0,0,1,0,0,0,acid refluxThe acid reflux went away for a few...,acid refluxthe acid reflux go away month day d...
4,2,0,1,0,0,0,0,1,0,0,0,fibromyalgiaI think that the Lyrica was starti...,fibromyalgiai think lyrica start help pain sid...


In [69]:
from sklearn.feature_extraction.text import CountVectorizer 
countVect = CountVectorizer(max_features=100, lowercase=True) 

In [70]:
countVect.fit(df['clean_doc'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [71]:
result_test = countVect.transform(df['clean_doc'])

In [72]:
print(result_test.toarray().shape)

(4132, 100)


In [73]:
countVect.get_feature_names()

['able',
 'acne',
 'also',
 'anxiety',
 'back',
 'bad',
 'become',
 'begin',
 'benefit',
 'blood',
 'cause',
 'change',
 'control',
 'could',
 'daily',
 'day',
 'depression',
 'doctor',
 'dosage',
 'dose',
 'drug',
 'dry',
 'due',
 'effect',
 'effective',
 'even',
 'every',
 'experience',
 'face',
 'feel',
 'felt',
 'find',
 'first',
 'get',
 'give',
 'go',
 'good',
 'headache',
 'help',
 'high',
 'hour',
 'however',
 'increase',
 'know',
 'less',
 'level',
 'life',
 'like',
 'long',
 'loss',
 'low',
 'make',
 'medication',
 'mg',
 'mild',
 'month',
 'morning',
 'much',
 'need',
 'never',
 'night',
 'notice',
 'one',
 'pain',
 'per',
 'period',
 'pill',
 'prescribe',
 'problem',
 'really',
 'reduce',
 'see',
 'seem',
 'severe',
 'side',
 'since',
 'skin',
 'sleep',
 'start',
 'still',
 'stomach',
 'stop',
 'symptom',
 'tablet',
 'take',
 'the',
 'think',
 'time',
 'treatment',
 'try',
 'two',
 'use',
 'week',
 'weight',
 'well',
 'within',
 'without',
 'work',
 'would',
 'year']

In [74]:
vectdf = pd.DataFrame(result_test.toarray(), columns=countVect.get_feature_names())
final_data = df.join(vectdf)
final_data = final_data.drop(['my_doc','clean_doc'] , axis =1)

In [75]:
final_data.head()

Unnamed: 0,rating,Mild Side Effects,Severe Side Effects,No Side Effects,Extremely Severe Side Effects,Moderate Side Effects,Highly Effective,Marginally Effective,Ineffective,Considerably Effective,...,two,use,week,weight,well,within,without,work,would,year
0,4,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0,9,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,1,0,1,0,0,0,1,0,0,0,...,0,2,0,0,1,0,0,0,1,0
1,9,1,0,0,0,0,1,0,0,0,...,0,2,0,0,1,0,0,0,1,0
2,10,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,1,0


In [76]:
Y_train = final_data['rating']
X_train = final_data.drop('rating',axis =1)

x_train, x_test , y_train , y_test = train_test_split(X_train,Y_train,test_size = 0.3, random_state = 11)

In [77]:
NB.fit(x_train,y_train)

<__main__.NaiveBayes_multiN at 0x7f0f76513518>

In [78]:
NB.predict(x_test)

array([9, 9, 8, ..., 6, 2, 6])

In [79]:
NB.accuracy(y_test)

0.16048387096774194

After running the Gaussian discriminant analysis and the Naive Bayes models on the dataset we can see that both of the give almost the same accuracy. However the scores are very slow for both of them, so we can continue working on transforming our features in order to get move more predictive variables.

# GDA and Naive Bayes vs Logistics Regression 

In [80]:
def comparison(model1,model2,my_data):
    dfr = my_data.copy()
    y = dfr['rating']
    X = dfr.drop('rating',axis =1)
    train_size = [0.1,0.3,0.6]
    scor1 = []
    scor2 = []
    size_tr = []
    summary = pd.DataFrame()
    for i in train_size:
        x_train, x_test , y_train , y_test = train_test_split(X,y,test_size = (1-i), random_state = 42)
        model1.fit(x_train,y_train)
        model1.predict(x_test)
        
        model2.fit(x_train,y_train)
        model2.predict(x_test)
        
        size_tr.append(i*100)
        scor1.append(model1.accuracy(y_test))
        scor2.append(model2.accuracy(y_test))
    summary['train_size'] = size_tr
    summary[str(model1)] = scor1
    summary[str(model2)] = scor2
    
    return summary
    

## GDA vs Logistics Regession

In [81]:
GDA_vs_LG = comparison(GDA,mcl,data)

In [82]:
GDA_vs_LG

Unnamed: 0,train_size,<__main__.GDAnalysis object at 0x7f0f76513198>,<__main__.MCLogisticReg object at 0x7f0f76513358>
0,10.0,0.0363,0.454154
1,30.0,0.030418,0.457656
2,60.0,0.059891,0.46582


In [83]:
dfr = data.copy()
y = dfr['rating']
X = dfr.drop('rating',axis =1)
L = [100]
a = GDA.fit(X,y)
b = GDA.predict(X)
L.append(GDA.accuracy(y))
c = mcl.fit(X,y)
d = mcl.predict(X)
L.append(mcl.accuracy(y))
GDA_vs_LG.loc[4] = L

In [84]:
GDA_vs_LG

Unnamed: 0,train_size,<__main__.GDAnalysis object at 0x7f0f76513198>,<__main__.MCLogisticReg object at 0x7f0f76513358>
0,10.0,0.0363,0.454154
1,30.0,0.030418,0.457656
2,60.0,0.059891,0.46582
4,100.0,0.152953,0.469264


For each size of our training data the Logistic Regression gives better accuracy than the Gaussian Discriminant Analysi. Therefore, we can said that the logistics regression is more performent than the GDA.

## Naive Bayes vs Logistic Regresion

In [85]:
NB_vs_LG = comparison(NB,mcl,final_data)

In [86]:
NB_vs_LG

Unnamed: 0,train_size,<__main__.NaiveBayes_multiN object at 0x7f0f76513518>,<__main__.MCLogisticReg object at 0x7f0f76513358>
0,10.0,0.161871,0.33665
1,30.0,0.172831,0.385759
2,60.0,0.160315,0.434967


In [87]:
dfr = final_data.copy()
y = dfr['rating']
X = dfr.drop('rating',axis =1)
L = [100]
a = NB.fit(X,y)
b = NB.predict(X)
L.append(NB.accuracy(y))
c = mcl.fit(X,y)
d = mcl.predict(X)
L.append(mcl.accuracy(y))
NB_vs_LG.loc[4] = L

In [88]:
NB_vs_LG

Unnamed: 0,train_size,<__main__.NaiveBayes_multiN object at 0x7f0f76513518>,<__main__.MCLogisticReg object at 0x7f0f76513358>
0,10.0,0.161871,0.33665
1,30.0,0.172831,0.385759
2,60.0,0.160315,0.434967
4,100.0,0.152469,0.511133


Like our previous comparison we can see that the logistic regression performs better that the Naive Bayes for each train data size.

# Conclusion
In this project we have coding from scratch the Gaussian Discriminant Analysis model, the Naive Bayes model and the multiclasses logistic regression.
At the first time we have compared the GDA and Naive Bayes model and i saw that the Naive Bayes perform better.
At the second time, we compared both GDA and Naive Bayes with the Logistics regression by using different size of data. we noticed that for each size of data the Logistics regression performed better than Naive Bayes and GDA.