In [None]:
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import time

# Function to download and load dataset
def load_data():
    """Download the UCI HAR training split and return it as two DataFrames.

    The dataset landing page is scraped for the first anchor whose ``href``
    ends in ``.zip``. That archive is downloaded into memory, the nested
    ``UCI HAR Dataset.zip`` inside it is opened, and the whitespace-delimited
    ``train/X_train.txt`` and ``train/y_train.txt`` files are parsed.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: ``(df, y)`` where ``df``
        holds the contents of ``X_train.txt`` and ``y`` the contents of
        ``y_train.txt``. Both are read with ``header=None``, so columns are
        labelled with integers.

    Raises:
        RuntimeError: If the landing page or the archive cannot be fetched
            (non-200 status) or no ``.zip`` link is present on the page.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = 'https://archive.ics.uci.edu'
    page_url = base_url + '/dataset/240/human+activity+recognition+using+smartphones'

    # A timeout stops a stalled connection from hanging the notebook forever.
    page_response = requests.get(page_url, timeout=60)
    if page_response.status_code != 200:
        raise RuntimeError(
            "Failed to download or parse the dataset: "
            f"dataset page returned HTTP {page_response.status_code}."
        )

    soup = BeautifulSoup(page_response.content, 'html.parser')
    anchor = soup.select_one('a[href$=".zip"]')
    if anchor is None:
        # select_one() returns None when nothing matches; fail with a clear message.
        raise RuntimeError(
            "Failed to download or parse the dataset: no .zip link found on the dataset page."
        )

    # urljoin handles both site-relative hrefs and already-absolute URLs.
    full_download_url = urljoin(base_url, anchor['href'])
    response = requests.get(full_download_url, timeout=60)
    if response.status_code != 200:
        # Previously this case fell through to `return df, y` with both names unbound.
        raise RuntimeError(
            "Failed to download or parse the dataset: "
            f"archive download returned HTTP {response.status_code}."
        )

    inner_zip_name = 'UCI HAR Dataset.zip'
    with zipfile.ZipFile(io.BytesIO(response.content)) as outer_zip:
        # The downloaded archive wraps a second zip that holds the actual data files.
        with outer_zip.open(inner_zip_name) as inner_zip_file:
            inner_bytes = io.BytesIO(inner_zip_file.read())

    with zipfile.ZipFile(inner_bytes) as inner_zip:
        # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
        with inner_zip.open('UCI HAR Dataset/train/X_train.txt') as myfile:
            df = pd.read_csv(myfile, sep=r'\s+', header=None)
        with inner_zip.open('UCI HAR Dataset/train/y_train.txt') as myfile_y:
            y = pd.read_csv(myfile_y, sep=r'\s+', header=None)

    return df, y

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
import numpy as np
import time

# Load dataset
df, y = load_data()


# TASK 1 - Exploratory data analysis (EDA) to get a feel for the data.
# The main takeaway is that the dataset has many features that are hard to
# interpret individually -- they are just a bunch of readings from sensors.
# We suspect many of these features are redundant or irrelevant, and we want
# to find a good subset of them.
import matplotlib.pyplot as plt
import seaborn as sns

# Basic shape information
print("Feature dataset shape:", df.shape)
print("Target dataset shape:", y.shape)

# First few rows of the feature dataset
print("First few rows of features:")
print(df.head())

# First few rows of the target dataset
print("First few rows of target:")
print(y.head())

# Total count of missing values (summed over all columns)
print("\nMissing values in the feature dataset:\n", df.isnull().sum().sum())
print("Missing values in the target dataset:\n", y.isnull().sum().sum())

# Summary statistics of the features
print("\nSummary statistics of features:")
print(df.describe())

# Target variable distribution (column 0 of y holds the activity label)
plt.figure(figsize=(8, 6))
sns.countplot(x=y[0], palette="viridis")
plt.title("Distribution of Target Variable")
plt.xlabel("Activity")
plt.ylabel("Count")
plt.show()

# Pairwise correlation between all features. The full matrix is plotted (no
# sub-sampling takes place), so the title no longer claims to show a sample.
correlation_matrix = df.corr()
# Open an explicit figure rather than relying on pyplot's implicit current figure.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', cbar=True)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Task 2: Encode class labels
# Column 0 of `y` holds the activity labels; LabelEncoder maps them to
# consecutive integers starting at 0.
label_encoder = LabelEncoder().fit(y[0])  # fit() returns the encoder itself
encoded_y = label_encoder.transform(y[0])
print(encoded_y)

In [None]:
# Task 3: Scale the features using StandardScaler
# NOTE(review): the scaler is fit on the whole of `df` here, before the
# train/test split in Task 4, so test rows influence the scaling statistics.
# Confirm whether that is acceptable for this exercise.
scaler = StandardScaler()

# fit_transform() returns a plain array; wrap it straight back into a
# DataFrame that carries the original column labels.
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)


In [None]:
# Task 4: Split the data into training and testing sets
# 80/20 split with a fixed seed; stratifying on the encoded labels keeps the
# class proportions the same in both parts.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    df_scaled,
    encoded_y,
    test_size=0.2,
    random_state=42,
    stratify=encoded_y,
)

# Report the shape of each resulting piece
for _caption, _part in (
    ("Training features shape:", X_train_full),
    ("Testing features shape:", X_test_full),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(_caption, _part.shape)


In [None]:
# TASK 5 - Gaussian Naive Bayes baseline on all features:
#          build the pipeline, fit it on the training data, predict the
#          test set, and print the accuracy score.
from sklearn.naive_bayes import GaussianNB

# Single-stage pipeline: the classifier is the only step.
pipeline = Pipeline(steps=[('classifier', GaussianNB())])

# Pipeline.fit() returns the pipeline, so training and prediction are chained.
y_pred = pipeline.fit(X_train_full, y_train).predict(X_test_full)

# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {accuracy:.4f}")

In [None]:
#TASK 6 - 1. Note the start time before defining the pipeline
#         2. Note the end time and report the difference as the time taken by the model training and inference.
import time

# Step 1: Note the start time.
# time.perf_counter() is a monotonic, high-resolution clock meant for timing
# short intervals. time.time() follows the wall clock, which can be coarse
# and can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Step 2: Create a pipeline with Gaussian Naive Bayes
pipeline = Pipeline([('classifier', GaussianNB())])

# Step 3: Fit the model to the training data
pipeline.fit(X_train_full, y_train)

# Step 4: Predict values for the test set
y_pred = pipeline.predict(X_test_full)

# Step 5: Note the end time
end_time = time.perf_counter()

# Step 6: Calculate and print the time taken (pipeline construction + fit + predict)
time_taken = end_time - start_time
print(f"Time taken for model training and inference: {time_taken:.4f} seconds")

# Step 7: Print the accuracy score (computed outside the timed region)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {accuracy:.2f}")




In [None]:
# TASK 7 - K-Means for dimensionality reduction
# Cluster the features (not the samples) and keep one randomly chosen
# representative feature from each cluster.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(df_scaled.T)  # Transpose to treat features as data points

# Seeded generator: without it the picked features change on every run even
# though the clustering itself is seeded.
rng = np.random.default_rng(42)
selected_features_indices = [
    int(rng.choice(np.flatnonzero(kmeans.labels_ == cluster_id)))
    for cluster_id in range(n_clusters)
]

# Take the columns from the scaled frame, i.e. the same data that was
# clustered and that the all-features model is trained on.
selected_features = df_scaled.iloc[:, selected_features_indices]

# Show which features were picked and a preview rather than the whole frame.
print("Selected feature indices:", selected_features_indices)
print(selected_features.head())





In [None]:
from sklearn.naive_bayes import GaussianNB

# Split the full and the selected feature sets in ONE call so both share the
# same row partition and the same labels. Two separate calls (one stratified,
# one not) partition the rows differently, which left the selected-feature
# rows paired with labels from the other split.
(X_train_full, X_test_full,
 X_train, X_test,
 y_train, y_test) = train_test_split(
    df_scaled, selected_features, encoded_y,
    test_size=0.2, random_state=42, stratify=encoded_y)

# Train Gaussian Naive Bayes on all features and measure time.
# perf_counter() is used because it is monotonic and high-resolution, which
# matters for intervals as short as a single fit/predict.
start_time = time.perf_counter()
classifier_pipeline = Pipeline([('classifier', GaussianNB())])
classifier_pipeline.fit(X_train_full, y_train)
y_pred = classifier_pipeline.predict(X_test_full)
end_time = time.perf_counter()
full_features_time = end_time - start_time
accuracy_full = accuracy_score(y_test, y_pred)

# Train Gaussian Naive Bayes on selected features and measure time
start_time = time.perf_counter()
classifier_pipeline = Pipeline([('classifier', GaussianNB())])
classifier_pipeline.fit(X_train, y_train)
y_pred = classifier_pipeline.predict(X_test)
end_time = time.perf_counter()
selected_features_time = end_time - start_time

# Evaluate accuracy of the selected-features model
accuracy_selected_features = accuracy_score(y_test, y_pred)

# Print results
print(f"Time taken with all features: {full_features_time:.4f} seconds")
print(f"Accuracy with all features: {accuracy_full:.4f}")
print(f"Time taken with selected features: {selected_features_time:.4f} seconds")
print(f"Accuracy with selected features: {accuracy_selected_features:.4f}")