Skeleton Code

In [None]:
import requests
from bs4 import BeautifulSoup
import zipfile
import io
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import time

# Function to download and load dataset
def load_data():
    """Download the UCI HAR dataset and return its training split.

    The dataset landing page is fetched and scanned for the first link whose
    href ends in ``.zip``. That archive is downloaded into memory; it contains
    a nested archive named ``UCI HAR Dataset.zip`` from which
    ``train/X_train.txt`` and ``train/y_train.txt`` are read as
    whitespace-separated, header-less tables.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df, y)``: the parsed contents of ``X_train.txt`` and ``y_train.txt``.

    Raises
    ------
    RuntimeError
        If the landing page or the archive cannot be downloaded, or if no
        ``.zip`` link is present on the landing page.
    """
    # Local import: keeps this function self-contained without touching the
    # notebook's import cell.
    from urllib.parse import urljoin

    page_url = 'https://archive.ics.uci.edu/dataset/240/human+activity+recognition+using+smartphones'
    # A timeout prevents the cell from hanging forever on a stalled connection.
    page_response = requests.get(page_url, timeout=60)
    if page_response.status_code != 200:
        raise RuntimeError("Failed to download or parse the dataset.")

    soup = BeautifulSoup(page_response.content, 'html.parser')
    zip_anchor = soup.select_one('a[href$=".zip"]')
    if zip_anchor is None:
        # select_one returns None when nothing matches; subscripting it would
        # raise an unhelpful TypeError.
        raise RuntimeError("No .zip download link found on the dataset page.")

    # urljoin handles root-relative, page-relative and absolute hrefs alike.
    full_download_url = urljoin(page_url, zip_anchor['href'])

    response = requests.get(full_download_url, timeout=60)
    if response.status_code != 200:
        # Previously this path fell through to `return df, y` with both names
        # unbound, which surfaced as an UnboundLocalError.
        raise RuntimeError(
            f"Failed to download the dataset archive (HTTP {response.status_code})."
        )

    # The download is a zip that wraps a second zip holding the actual files.
    with zipfile.ZipFile(io.BytesIO(response.content)) as outer_zip:
        inner_zip_name = 'UCI HAR Dataset.zip'
        with outer_zip.open(inner_zip_name) as inner_zip_file:
            inner_zip_bytes = inner_zip_file.read()

    with zipfile.ZipFile(io.BytesIO(inner_zip_bytes)) as inner_zip:
        # sep=r'\s+' replaces the deprecated delim_whitespace=True.
        with inner_zip.open('UCI HAR Dataset/train/X_train.txt') as myfile:
            df = pd.read_csv(myfile, sep=r'\s+', header=None)
        with inner_zip.open('UCI HAR Dataset/train/y_train.txt') as myfile_y:
            y = pd.read_csv(myfile_y, sep=r'\s+', header=None)

    return df, y

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
import numpy as np
import time

# Load the training features (df) and activity labels (y) returned by load_data().
df, y = load_data()

# TASK 1 - Do EDA and understand a little about the data.
# The only important thing to know is that it has a lot of features that are not
# individually meaningful: they are just a bunch of readings from sensors.
# We think many of these features are redundant or irrelevant, and we want to find good features.


  df = pd.read_csv(myfile, delim_whitespace=True, header=None)
  y = pd.read_csv(myfile_y, delim_whitespace=True, header=None)


In [None]:
# Report the dimensions of the feature table and the label table.
print(f"Shape of X: {df.shape}", f"Shape of y: {y.shape}", sep="\n")

# Show the leading rows of each table, each under its own heading.
print("First few rows of X:", df.head(), sep="\n")

print("First few rows of y:", y.head(), sep="\n")

Shape of X: (7352, 561)
Shape of y: (7352, 1)
First few rows of X:
        0         1         2         3         4         5         6    \
0  0.288585 -0.020294 -0.132905 -0.995279 -0.983111 -0.913526 -0.995112   
1  0.278419 -0.016411 -0.123520 -0.998245 -0.975300 -0.960322 -0.998807   
2  0.279653 -0.019467 -0.113462 -0.995380 -0.967187 -0.978944 -0.996520   
3  0.279174 -0.026201 -0.123283 -0.996091 -0.983403 -0.990675 -0.997099   
4  0.276629 -0.016570 -0.115362 -0.998139 -0.980817 -0.990482 -0.998321   

        7         8         9    ...       551       552       553       554  \
0 -0.983185 -0.923527 -0.934724  ... -0.074323 -0.298676 -0.710304 -0.112754   
1 -0.974914 -0.957686 -0.943068  ...  0.158075 -0.595051 -0.861499  0.053477   
2 -0.963668 -0.977469 -0.938692  ...  0.414503 -0.390748 -0.760104 -0.118559   
3 -0.982750 -0.989302 -0.938692  ...  0.404573 -0.117290 -0.482845 -0.036788   
4 -0.979672 -0.990441 -0.942469  ...  0.087753 -0.351471 -0.699205  0.123320   

 

In [None]:
# Per-column descriptive statistics (count, mean, std, quartiles, extremes).
print("Statistical Summary of Features:", df.describe(), sep="\n")


Statistical Summary of Features:
               0            1            2            3            4    \
count  7352.000000  7352.000000  7352.000000  7352.000000  7352.000000   
mean      0.274488    -0.017695    -0.109141    -0.605438    -0.510938   
std       0.070261     0.040811     0.056635     0.448734     0.502645   
min      -1.000000    -1.000000    -1.000000    -1.000000    -0.999873   
25%       0.262975    -0.024863    -0.120993    -0.992754    -0.978129   
50%       0.277193    -0.017219    -0.108676    -0.946196    -0.851897   
75%       0.288461    -0.010783    -0.097794    -0.242813    -0.034231   
max       1.000000     1.000000     1.000000     1.000000     0.916238   

               5            6            7            8            9    ...  \
count  7352.000000  7352.000000  7352.000000  7352.000000  7352.000000  ...   
mean     -0.604754    -0.630512    -0.526907    -0.606150    -0.468604  ...   
std       0.418687     0.424073     0.485942     0.414122     0

In [None]:
# Count null cells across the whole feature table in one pass.
missing_values = df.isna().to_numpy().sum()
print(f"Total Missing Values: {missing_values}")


Total Missing Values: 0


In [None]:
# How many samples fall under each activity label (column 0 of y).
print("Target Variable Distribution:", y.loc[:, 0].value_counts(), sep="\n")


Target Variable Distribution:
0
6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: count, dtype: int64


In [None]:
# Pairwise correlation between every pair of feature columns.
correlation_matrix = df.corr()

# Positions (row, column) where the absolute correlation exceeds 0.9.
high_corr = np.nonzero((correlation_matrix.abs() > 0.9).to_numpy())

# Translate positions to labels, skipping the diagonal (a feature with itself).
# Both orderings of a pair are kept, since the matrix is scanned in full.
high_corr_pairs = [
    (correlation_matrix.index[row], correlation_matrix.columns[col])
    for row, col in zip(high_corr[0], high_corr[1])
    if row != col
]
print(f"Highly Correlated Feature Pairs: {high_corr_pairs}")


Highly Correlated Feature Pairs: [(3, 4), (3, 6), (3, 7), (3, 9), (3, 12), (3, 13), (3, 15), (3, 16), (3, 19), (3, 83), (3, 84), (3, 86), (3, 87), (3, 89), (3, 92), (3, 95), (3, 96), (3, 99), (3, 100), (3, 102), (3, 103), (3, 104), (3, 135), (3, 165), (3, 168), (3, 181), (3, 200), (3, 201), (3, 202), (3, 203), (3, 205), (3, 206), (3, 207), (3, 213), (3, 214), (3, 215), (3, 216), (3, 218), (3, 219), (3, 220), (3, 226), (3, 227), (3, 228), (3, 229), (3, 231), (3, 233), (3, 234), (3, 239), (3, 244), (3, 265), (3, 266), (3, 268), (3, 269), (3, 271), (3, 272), (3, 274), (3, 280), (3, 281), (3, 284), (3, 285), (3, 287), (3, 288), (3, 289), (3, 302), (3, 310), (3, 314), (3, 344), (3, 345), (3, 347), (3, 348), (3, 350), (3, 351), (3, 353), (3, 359), (3, 360), (3, 363), (3, 364), (3, 366), (3, 367), (3, 368), (3, 389), (3, 393), (3, 423), (3, 425), (3, 429), (3, 438), (3, 444), (3, 447), (3, 502), (3, 503), (3, 504), (3, 505), (3, 507), (3, 508), (3, 509), (3, 510), (3, 515), (3, 516), (3, 517)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the activity labels to consecutive integer codes starting at 0.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y.to_numpy().ravel())

# Peek at the first ten encoded labels.
print("Encoded Labels:", encoded_y[:10], sep="\n")


Encoded Labels:
[4 4 4 4 4 4 4 4 4 4]


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, before any train/test split
# is made; consider fitting on the training rows only to avoid leakage.
scaler = StandardScaler()

# Fit, transform, and wrap the result in a DataFrame that keeps the original
# column labels (easier to inspect than a bare array).
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Show the leading rows of the scaled table.
print("Scaled Features:", df_scaled.head(), sep="\n")


Scaled Features:
        0         1         2         3         4         5         6    \
0  0.200642 -0.063683 -0.419628 -0.868814 -0.939441 -0.737529 -0.859817   
1  0.055948  0.031486 -0.253908 -0.875426 -0.923902 -0.849304 -0.868531   
2  0.073515 -0.043416 -0.076295 -0.869039 -0.907760 -0.893785 -0.863137   
3  0.066696 -0.208422 -0.249712 -0.870626 -0.940022 -0.921805 -0.864503   
4  0.030469  0.027587 -0.109848 -0.875188 -0.934878 -0.921343 -0.867384   

        7         8         9    ...       551       552       553       554  \
0 -0.939019 -0.766437 -0.856036  ... -0.795359  0.025960 -0.276399 -0.360603   
1 -0.921998 -0.848928 -0.871359  ...  0.130614 -0.897357 -0.767990  0.133011   
2 -0.898854 -0.896701 -0.863323  ...  1.152336 -0.260878 -0.438316 -0.377840   
3 -0.938124 -0.925279 -0.863323  ...  1.112769  0.591045  0.463155 -0.135025   
4 -0.931789 -0.928028 -0.870260  ... -0.149577 -0.138515 -0.240313  0.340406   

        555       556       557       558       559

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; stratifying on the encoded labels
# keeps the class proportions the same in both partitions.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    df_scaled,
    encoded_y,
    test_size=0.2,
    random_state=42,
    stratify=encoded_y,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Build a single-step pipeline around Gaussian Naive Bayes and train it on the
# full feature set (Pipeline.fit returns the fitted pipeline itself).
pipeline = Pipeline(steps=[('classifier', GaussianNB())]).fit(X_train_full, y_train)

# Score the held-out samples.
y_pred = pipeline.predict(X_test_full)

# Fraction of test samples predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy Score: {accuracy:.4f}")


Accuracy Score: 0.7587


In [None]:
# Step 1: Note the start time.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() is a wall clock that can jump when
# the system clock is adjusted.
start_time = time.perf_counter()

# Step 2: Create and fit the pipeline
pipeline = Pipeline([
    ('classifier', GaussianNB())
])
pipeline.fit(X_train_full, y_train)

# Step 3: Predict values for the test set
y_pred = pipeline.predict(X_test_full)

# Step 4: Note the end time
end_time = time.perf_counter()

# Step 5: Calculate the time taken (seconds spent on fit + predict together)
time_taken = end_time - start_time
print(f"Time taken for model training and inference: {time_taken:.4f} seconds")

# Step 6: Print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.4f}")

Time taken for model training and inference: 0.3553 seconds
Accuracy Score: 0.7587


In [None]:
# Number of feature clusters to form; one feature is kept per cluster, so this
# is also the size of the reduced feature set. Adjust to suit the dataset.
n_clusters = 10

# Cluster the *features*: transposing makes each column of df_scaled a data
# point for K-Means.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(df_scaled.T)

# For every cluster, keep the first feature (lowest position) assigned to it
# as that cluster's representative.
selected_features_indices = [
    np.flatnonzero(kmeans.labels_ == cluster_id)[0]
    for cluster_id in range(n_clusters)
]

# Slice the scaled table down to the representative columns.
selected_features = df_scaled.iloc[:, selected_features_indices]

# Report which columns were chosen and the resulting table size.
print(f"Selected feature indices: {selected_features_indices}")
print(f"Shape of selected features: {selected_features.shape}")


Selected feature indices: [26, 3, 41, 12, 16, 65, 43, 18, 0, 70]
Shape of selected features: (7352, 10)


In [None]:
# Split the reduced dataset into training and testing sets.
# Use the 1-D encoded labels (encoded_y), exactly as the full-feature split
# does: passing the raw single-column DataFrame `y` makes scikit-learn emit a
# DataConversionWarning when fitting, and trains this model on a different
# label encoding than the full-feature model it is compared against.
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
    selected_features, encoded_y, test_size=0.2, random_state=42, stratify=encoded_y
)

# Record the start time (perf_counter is monotonic and suited to short
# intervals, unlike the wall-clock time.time()).
start_time = time.perf_counter()

# Train a Gaussian Naive Bayes model on the selected features only
pipeline_reduced = Pipeline([
    ('classifier', GaussianNB())
])
pipeline_reduced.fit(X_train_reduced, y_train_reduced)

# Predict on the reduced test set
y_pred_reduced = pipeline_reduced.predict(X_test_reduced)

# Record the end time
end_time = time.perf_counter()

# Calculate the time taken (fit + predict together)
time_taken_reduced = end_time - start_time

# Calculate accuracy
accuracy_reduced = accuracy_score(y_test_reduced, y_pred_reduced)

# Print results
print(f"Time taken for training and inference on reduced dataset: {time_taken_reduced:.4f} seconds")
print(f"Accuracy on reduced dataset: {accuracy_reduced:.4f}")


Time taken for training and inference on reduced dataset: 0.0808 seconds
Accuracy on reduced dataset: 0.7288


  y = column_or_1d(y, warn=True)
