 # Lab 2 - Probability and Statistics

# Part A

# Preprocessing

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

## Load the dataset

In [34]:
# Part A data: read the CSV into a DataFrame (the `text` and `label` columns are used below).
df = pd.read_csv(filepath_or_buffer="Lab2_dataset.csv")

## Split the dataset into training and testing

In [35]:
# Inputs are the raw message text; targets are the `label` column.
X, y = df["text"], df["label"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\nth...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0
2,3624,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\nthis deal is to ...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\nthe transport v...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\nhpl ...,0
5168,2933,ham,Subject: calpine daily gas nomination\n>\n>\nj...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


## Use the CountVectorizer function in sklearn to transform the "text" feature to a vector representation of a predetermined size.

In [36]:
# Bag-of-words encoding. The vocabulary is learned from the training split
# only; the test split is encoded with that same vocabulary so no information
# from the test set leaks into the features.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

# Model Training and Evaluation

## Train the Sklearn SVC model on the training dataset and evaluate on the test set

In [37]:
# Fit a support-vector classifier with scikit-learn's default settings.
svc_model = SVC()
svc_model.fit(X=X_train_vectorized, y=y_train)

# Score the fitted model on the held-out split.
svc_predictions = svc_model.predict(X_test_vectorized)
svc_accuracy = accuracy_score(y_true=y_test, y_pred=svc_predictions)
print("SVC Accuracy:", svc_accuracy)

SVC Accuracy: 0.9652173913043478


## Train and evaluate the Gaussian Naive Bayes classifier as well

In [38]:
# `.toarray()` turns the vectorizer output into a dense array before it is
# handed to GaussianNB (both for fitting and for prediction).
gaussian_nb_model = GaussianNB().fit(X_train_vectorized.toarray(), y_train)

# Score the fitted model on the held-out split.
gaussian_nb_predictions = gaussian_nb_model.predict(X_test_vectorized.toarray())
gaussian_nb_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=gaussian_nb_predictions,
)
print("Gaussian Naive Bayes Accuracy:", gaussian_nb_accuracy)

Gaussian Naive Bayes Accuracy: 0.9545893719806763


## Train and evaluate the Multinomial Naive Bayes classifier as well

In [39]:
# MultinomialNB is fitted directly on the count features produced by the vectorizer.
multinomial_nb_model = MultinomialNB().fit(X_train_vectorized, y_train)

# Score the fitted model on the held-out split.
multinomial_nb_predictions = multinomial_nb_model.predict(X_test_vectorized)
multinomial_nb_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=multinomial_nb_predictions,
)
print("Multinomial Naive Bayes Accuracy:", multinomial_nb_accuracy)

Multinomial Naive Bayes Accuracy: 0.9719806763285024


# Part B

## Load Dataset

In [40]:
import pandas as pd
import numpy as np
# Part B data: read the AB_NYC_2019 CSV. Note that this rebinds `df`, which
# held the Part A data until this point.
df = pd.read_csv(filepath_or_buffer="AB_NYC_2019.csv")
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


## Compare the Z-score approach and the whiskers (IQR) approach to determine which is better at removing outliers in this case.

In [45]:
from scipy import stats

price = df["price"]

# Z-score rule: flag rows whose price lies more than `threshold` standard
# deviations away from the mean price.
z_scores = np.abs(stats.zscore(price))
threshold = 3
outlier_mask_zscore = z_scores > threshold

# Whiskers (IQR) rule: flag rows whose price falls outside the interval
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# The mask is built here so the cell runs on a fresh kernel instead of relying
# on a variable left over from an earlier, out-of-order execution.
outlier_mask_whiskers = (price < lower_bound) | (price > upper_bound)

# Keep only the rows that each rule does NOT flag as outliers.
df_clean_zscore = df[~outlier_mask_zscore]
df_clean_whiskers = df[~outlier_mask_whiskers]

print("Original dataset size:", df.shape)
print("Size of cleaned dataset using Z-score approach:", df_clean_zscore.shape)
print("Size of cleaned dataset using Whiskers approach:", df_clean_whiskers.shape)

Original dataset size: (48895, 16)
Size of cleaned dataset using Z-score approach: (48507, 16)
Size of cleaned dataset using Whiskers approach: (45923, 16)


In [43]:
# A notebook cell only renders its LAST bare expression, so a bare `.head()`
# in the middle of the cell is silently dropped. `display` (available by
# default in IPython/Jupyter kernels) renders each table right after its label.
print("Head of cleaned dataset using Z-score approach:")
display(df_clean_zscore.head())
print("Head of cleaned dataset using Whiskers approach:")
display(df_clean_whiskers.head())

Head of cleaned dataset using Z-score approach:
Head of cleaned dataset using Whiskers approach:


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
