In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import nltk
from nltk import stem
from nltk.corpus import stopwords
stemmer = stem.SnowballStemmer('english')
stopwords = set(stopwords.words('english'))
import string
np.seterr(divide = 'ignore') 
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode

## Gaussian naive Bayes

In [2]:
def pdf(class_idx, x, mean, var):
    mean = mean[class_idx]
    var = var[class_idx]
    numerator = np.exp(- (x-mean)**2 / (2 * var))
    denominator = np.sqrt(2 * np.pi * var)
    return numerator / denominator

In [3]:
def GaussNaiveBayes(X,y):
    n_samples, n_features = X.shape
    classes = np.unique(y)
    n_classes = len(classes)
    mean = np.zeros((n_classes, n_features), dtype=np.float64)
    var = np.zeros((n_classes, n_features), dtype=np.float64)
    priors =  np.zeros(n_classes, dtype=np.float64)
    for idx, c in enumerate(classes):
        X_c = X[y==c]
        mean[idx, :] = X_c.mean(axis=0)
        var[idx, :] = X_c.var(axis=0)
        priors[idx] = X_c.shape[0] / float(n_samples)
    return mean, var, priors, classes

In [4]:
def predict(X, classes, mean, var, priors):
    y_pred = [_predict(x, classes, mean, var, priors) for x in X]
    return np.array(y_pred)
def _predict(x, classes, mean, var, priors):
    posteriors = []
    for idx, c in enumerate(classes):
        prior = np.log(priors[idx])
        posterior = np.sum(np.log(pdf(idx, x, mean, var)))
        posterior = prior + posterior
        posteriors.append(posterior)          
    return classes[np.argmax(posteriors)]

In [5]:
data = pd.read_csv('spam.csv')
X = data[data.columns.difference(['label'])].values
y = data.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
mean, var, priors, classes=GaussNaiveBayes(X_train, y_train)
predictions = predict(X_test, classes, mean, var, priors)
print("Gaussian naive Bayes classification accuracy", accuracy_score(y_test, predictions))

Gaussian naive Bayes classification accuracy 0.8208469055374593


## Multinomial naive Bayes

In [6]:
def probability(test_point,class_, count):     
    log_prob = np.log(count[class_]['total_points']) - np.log(count['total_points'])
    total_words = len(test_point)
    for i in range(len(test_point)):
        current_word_prob = test_point[i]*(np.log(count[class_][i]+1)-np.log(count[class_]['total']+total_words))
        log_prob += current_word_prob      
    return log_prob

In [7]:
def MultinomialNaiveBayes(X, y):
    count = {}     
    classes = set(y)
    for class_ in classes:
        count[class_] = {}
        for i in range(len(X[0])):
            count[class_][i] = 0
        count[class_]['total'] = 0
        count[class_]['total_points'] = 0
    count['total_points'] = len(X)
    for i in range(len(X)):
        for j in range(len(X[0])):
            count[y[i]][j]+=X[i][j]
            count[y[i]]['total']+=X[i][j]
        count[y[i]]['total_points']+=1
    return count, classes

In [8]:
def predict_new(X, classes, count):
    y_pred = [] 
    for i in range(len(X)):
        y_pred.append(_predict_new(X[i], classes, count))      
    return y_pred
def _predict_new(test_point, classes, count): 
    best_class = None
    best_prob = None
    first_run = True      
    for class_ in classes:
        log_probability_current_class = probability(test_point,class_, count)
        if (first_run) or (log_probability_current_class > best_prob) :
            best_class = class_
            best_prob = log_probability_current_class
            first_run = False
                
    return best_class

In [9]:
data = pd.read_csv('smsspam.csv')
X = data[data.columns.difference(['label'])].values
y = data.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train=list(X_train)
vocab = {}
for i in range(len(X_train)):
    word_list = []
    for word in X_train[i][0].split():
        word_new  = word.strip(string.punctuation).lower()
        if (len(word_new)>2)  and (word_new not in stopwords):  
            if word_new in vocab:
                vocab[word_new]+=1
            else:
                vocab[word_new]=1

In [11]:
features = []
for key in vocab:
    features.append(key)

In [12]:
X_train_dataset = np.zeros((len(X_train),len(features)))
for i in range(len(X_train)):
    word_list = [ word.strip(string.punctuation).lower() for word in X_train[i][0].split()]
    for word in word_list:
        if word in features:
            X_train_dataset[i][features.index(word)] += 1

In [13]:
X_test_dataset = np.zeros((len(X_test),len(features)))
for i in range(len(X_test)):
    word_list = [ word.strip(string.punctuation).lower() for word in X_test[i][0].split()]
    for word in word_list:
        if word in features:
            X_test_dataset[i][features.index(word)] += 1

In [14]:
count, classes=MultinomialNaiveBayes(X_train_dataset,y_train)
y_pred = predict_new(X_test_dataset, classes, count)
print("Multinomial naive Bayes classification accuracy:",accuracy_score(y_pred,y_test))

Multinomial naive Bayes classification accuracy: 0.9802690582959641


## Global  Search with Monte Carlo

In [21]:
data = pd.read_csv('tsp.csv', header=None)
data.columns = ['vertex', 'x', 'y']
n = data.shape[0]
G = np.zeros([n, n])
for i in np.arange(n):
    for j in np.arange(n):
        if i != j:
            G[i, j] = np.sum(np.abs(data[data.vertex == (i+1)][['x', 'y']].values - data[data.vertex == (j+1)][['x', 'y']].values))
length=0
vertexes = np.arange(G.shape[0])
best_route = None
best_length = None
for i in np.arange(100000):
    route = vertexes.copy()
    np.random.shuffle(route)
    for i in range(0, len(route)-1):
        length+= G[route[i], route[i+1]]
    if not best_length or lenght < best_lenght:
        best_route = route
        best_lenght = length
    length=0
print("Route length:", best_lenght)
track = []
track.append(go.Scatter(x=data.x, y=data.y, mode='markers', marker=dict(size=14, color='black')))
start_x, start_y = data[data.vertex == (best_route[0]+1)][['x', 'y']].values[0]
stop_x, stop_y = data[data.vertex == (best_route[-1]+1)][['x', 'y']].values[0]
track.append(go.Scatter(x=[start_x, stop_x], y=[start_y, stop_y],mode='markers', marker=dict(size=20, color='red')))
track.append(go.Scatter(x=data.x, y=data.y, mode='markers', marker=dict(size=14, color='black')))
for i in range(0, len(best_route)-1):
    from_x, from_y = data[data.vertex == (best_route[i]+1)][['x', 'y']].values[0]
    to_x, to_y = data[data.vertex == (best_route[i+1]+1)][['x', 'y']].values[0]
    track.append(go.Scatter(x=[from_x, to_x, to_x], y=[from_y, from_y, to_y], line=dict(color='green')))
fig = go.Figure(track, layout=go.Layout(width=500, height=500, showlegend=False))
fig.update_xaxes(title='x')
fig.update_yaxes(title='y')
iplot(fig)

Route length: 33555.0
