In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Read the text-classification data from a CSV expected in the working directory.
df = pd.read_csv(filepath_or_buffer="Lab2_dataset.csv")

In [4]:
# Bag-of-words encoding of the "text" column.
# The vocabulary is capped at 1000 entries; that cap is an arbitrary choice
# and can be tuned.
# NOTE(review): the vocabulary is learned from every row, before the
# train/test split further down, so held-out documents influence which terms
# are kept. Consider fitting on the training rows only.
vectorizer = CountVectorizer(max_features=1000)
term_counts = vectorizer.fit_transform(df["text"])
X = term_counts.toarray()  # dense array of per-document term counts

# Target vector; this assumes the CSV provides a "label" column.
y = df["label"]

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

In [6]:
# Support-vector classifier with scikit-learn's default settings:
# fit on the training split, then predict the held-out split.
svc_model = SVC().fit(X_train, y_train)
svc_pred = svc_model.predict(X_test)

In [7]:
# Gaussian Naive Bayes with default settings:
# fit on the training split, then predict the held-out split.
gnb_model = GaussianNB().fit(X_train, y_train)
gnb_pred = gnb_model.predict(X_test)

In [8]:
# Multinomial Naive Bayes with default settings:
# fit on the training split, then predict the held-out split.
mnb_model = MultinomialNB().fit(X_train, y_train)
mnb_pred = mnb_model.predict(X_test)

In [9]:
# Report held-out accuracy for each of the three classifiers, one per line.
print(
    f"SVC Accuracy: {accuracy_score(y_test, svc_pred)}",
    f"GaussianNB Accuracy: {accuracy_score(y_test, gnb_pred)}",
    f"MultinomialNB Accuracy: {accuracy_score(y_test, mnb_pred)}",
    sep="\n",
)


SVC Accuracy: 0.9497584541062802
GaussianNB Accuracy: 0.9420289855072463
MultinomialNB Accuracy: 0.9342995169082126


In [10]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [11]:
# Read the listings data from a CSV expected in the working directory.
# Note that this rebinds `df`, replacing whatever frame the name held before.
df = pd.read_csv(filepath_or_buffer="AB_NYC_2019.csv")

In [12]:
# Z-score approach for outlier removal.
# nan_policy="omit" makes scipy compute the mean and standard deviation from
# the non-missing prices only. With the default ("propagate"), a single NaN
# price turns every z-score into NaN, and the `< 3` comparison below would
# then silently discard the whole dataset.
# Rows whose price is missing still receive a NaN z-score, so the comparison
# is False for them and they are dropped -- the same outcome the IQR filter's
# range comparison gives for NaN prices.
z_scores = np.abs(zscore(df["price"], nan_policy="omit"))

# Keep rows within 3 standard deviations of the mean price.
df_z = df[(z_scores < 3)]  # Threshold of 3 is typical but can be adjusted

In [13]:
# IQR (box-plot whisker) approach for outlier removal:
# keep rows whose price lies within 1.5 * IQR of the first and third quartiles.
Q1, Q3 = (df["price"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# `between` is inclusive on both ends, i.e. lower_bound <= price <= upper_bound.
df_iqr = df[df["price"].between(lower_bound, upper_bound)]

In [14]:
# Compare row counts before and after each outlier filter, one per line.
print(
    f"Original dataset length: {len(df)}",
    f"Dataset length after Z-score filtering: {len(df_z)}",
    f"Dataset length after IQR filtering: {len(df_iqr)}",
    sep="\n",
)

Original dataset length: 48895
Dataset length after Z-score filtering: 48507
Dataset length after IQR filtering: 45923


In [15]:
# Optionally persist both filtered frames as CSV files, without the index column.
for frame, out_path in (
    (df_z, "filtered_by_zscore.csv"),
    (df_iqr, "filtered_by_iqr.csv"),
):
    frame.to_csv(out_path, index=False)
