In [5]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB

<h1> PART A </h1>

In [6]:
data = pd.read_csv('Lab2_dataset.csv')

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'])

y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

#Support Vector Machine (SVM) Classifier

svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
svc_accuracy = accuracy_score(y_test, y_pred)
print("SVC accuracy:", svc_accuracy)

# Gaussian Naive Bayes Classifier
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)
y_pred = gnb.predict(X_test.toarray())
gnb_accuracy = accuracy_score(y_test, y_pred)
print("GaussianNB accuracy:", gnb_accuracy)

# Multinomial Naive Bayes Classifier
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
mnb_accuracy = accuracy_score(y_test, y_pred)
print("MultinomialNB accuracy:", mnb_accuracy)

SVC accuracy: 0.9559164733178654
GaussianNB accuracy: 0.9597834493426141
MultinomialNB accuracy: 0.9775715390564579


<h1> PART B </h1>

In [7]:
data = pd.read_csv('AB_NYC_2019.csv')

# Z-score approach to remove outliers
z_value = np.abs((data['price'] - data['price'].mean()) / data['price'].std())
data_z = data[z_value < 3]

# IQR approach to remove outliers
def lab2_partb(data, price):
    q1 = data[price].quantile(0.25)
    q3 = data[price].quantile(0.75)
    iqr = q3 - q1

    # Calculate the upper and lower bounds for outliers
    left_whisker = q1 - 1.5 * iqr
    right_whisker = q3 + 1.5 * iqr

    # Remove outliers
    final_data = data[(data[price] >= left_whisker) & (data[price] <= right_whisker)]
    
    return final_data

data_iqr = lab2_partb(data, 'price')

# Print the number of data points before and after removing outliers for both approaches
print('Actual data before removing outliers: ', len(data))
print('Data after removing outliers using Z-score approach: ', len(data_z))
print('Data after removing outliers using IQR approach: ', len(data_iqr))


Actual data before removing outliers:  48895
Data after removing outliers using Z-score approach:  48507
Data after removing outliers using IQR approach:  45923


<h2> 
It is observed that z-score approach removed 350-400 outliers which means there is less chances of data loss.

But IQR approach has majorly removed upto 5000 outliers which could result in significant data loss.

</h2>