<a href="https://colab.research.google.com/github/2023aiml537/capstone_har_model/blob/main/Human_Activity_Recognition_Using_Smartphones_Mid_term.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.layers import Dense,Input, Embedding,LSTM,Dropout,Conv1D, MaxPooling1D, GlobalMaxPooling1D,Dropout,Bidirectional,Flatten,BatchNormalization
from tensorflow.keras.preprocessing import sequence
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### **Importing data.** Since we are using Google Colab and the data is stored in Google Drive, we need to allow Colab to read it.

In [None]:
# Mount Google Drive inside the Colab runtime so that the dataset files
# referenced below under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

#### Loading data into data frame.

In [None]:
# Read the training CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/AI_ML_Project/dataset/train.csv')

In [None]:
df.head()

In [None]:
# Current train set size.
df.shape

#### Current train data set dimensions : (7352, 563)

In [None]:
# Separate features and target
# Split the training frame: every column except the label becomes a model
# input, and the 'Activity' column becomes the target.
# NOTE(review): the frame is reported as 563 columns wide; if any non-target
# column is an identifier (e.g. a subject/participant id) rather than a
# sensor feature, it stays in X_train here -- confirm that is intended.
X_train = df.drop(columns=['Activity'])
y_train = df.loc[:, 'Activity']

In [None]:
X_train.head()

In [None]:
y_train.head()

### Loading test data set.

In [None]:
# Read the test CSV from the same Drive folder into its own DataFrame.
df_test = pd.read_csv('/content/drive/MyDrive/AI_ML_Project/dataset/test.csv')

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
# Separate features and target
# Split the test frame the same way as the training frame: inputs are all
# columns except 'Activity', which is kept separately as the target.
X_test = df_test.drop(columns=['Activity'])
y_test = df_test.loc[:, 'Activity']

In [None]:
X_test.head()

In [None]:
y_test.head()

## **Feature Engineering**

#### **Duplicate Features**: Different features (columns) in a dataset that have the same or very similar values (the check below detects exact copies only). These redundant features make model training less efficient and can also contribute to overfitting, where the model learns noise in the data rather than the underlying patterns.

In [None]:
def get_duplicate_columns(df):
    """Group the columns of ``df`` whose contents are exact duplicates.

    Each column is reduced to a fingerprint made of its dtype and its
    values, and columns sharing a fingerprint are treated as duplicates.
    Only exact copies match; nearly-equal columns are not reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are scanned in their existing order.

    Returns
    -------
    dict
        Maps the first column seen with a given content to the list of
        later columns that repeat it. Columns that have no duplicate do
        not appear in the result.
    """
    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]
        values = current_column.values

        # Raw bytes are only a faithful fingerprint for plain numeric-style
        # arrays. Object arrays store pointers (their bytes are memory
        # addresses, not values) and extension arrays have no tobytes(),
        # so both fall back to the textual rendering of the column.
        if values.dtype != object and hasattr(values, "tobytes"):
            payload = values.tobytes()
        else:
            payload = current_column.to_string().encode()

        # The dtype is part of the key so that two columns of different
        # dtypes whose byte patterns happen to coincide are not merged.
        fingerprint = (str(values.dtype), payload)

        if fingerprint in seen_columns:
            first_seen = seen_columns[fingerprint]
            duplicate_columns.setdefault(first_seen, []).append(column)
        else:
            seen_columns[fingerprint] = column

    return duplicate_columns

In [None]:
duplicate_columns = get_duplicate_columns(X_train)
duplicate_columns

#### Taking a look into the duplicate column values.

In [None]:
X_train[['tBodyAccMag-mean()','tBodyAccMag-sma()','tGravityAccMag-mean()','tGravityAccMag-sma()']]

In [None]:
# Show the std() columns of the body and gravity magnitude signals side by
# side, mirroring the mean()/sma() comparison, to confirm they are duplicates.
X_train[['tBodyAccMag-std()','tGravityAccMag-std()']]

#### We can see that several columns in the data set are exact duplicates of one another, so we can drop the redundant columns.

In [None]:
# Flatten every group of duplicate names into one list and remove them from
# both splits in a single in-place drop per frame; the first column of each
# group (the dictionary key) is kept.
redundant_columns = [
    name
    for group in duplicate_columns.values()
    for name in group
]
X_train.drop(columns=redundant_columns, inplace=True)
X_test.drop(columns=redundant_columns, inplace=True)

In [None]:
print(X_train.shape)
print(X_test.shape)

#### After removing duplicate columns, the data set was reduced from 562 columns to 541 columns.

### **Variance Threshold**: It is used to remove features with low variance, i.e. features that barely vary and therefore provide little useful information for explaining the variation in the output. This method targets two types of features:
 - Constant Features: These are features where all values are the same.
 - Quasi-Constant Features: These features have the same value for the vast majority of rows (e.g., 995 out of 1000 rows have the same value), with only a few different values.

In [None]:
# Fit the variance filter on the training features only, using 0.05 as the
# variance threshold; the test split is filtered later with this same
# fitted selector.
sel = VarianceThreshold(threshold=0.05)
sel.fit(X_train)

In [None]:
sel.get_support()

In [None]:
columns = X_train.columns[sel.get_support()]
columns

In [None]:
# Apply the fitted variance filter to both splits and wrap each result back
# into a DataFrame labelled with the surviving column names.
X_train = pd.DataFrame(sel.transform(X_train), columns=columns)
X_test = pd.DataFrame(sel.transform(X_test), columns=columns)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
X_train.head()

### **Pearson Correlation** in Removing Multicollinearity

In [None]:
sns.heatmap(X_train.corr())

In [None]:
corr_matrix = X_train.corr()
corr_matrix

In [None]:
def get_correlated_columns(corr_matrix, threshold=0.95):
    """List the columns that are highly correlated with an earlier column.

    Only pairs ``(i, j)`` with ``i < j`` (the upper triangle) are inspected.
    For every pair whose coefficient is strictly greater than ``threshold``
    the later column ``j`` is reported. Coefficients are compared as-is
    (signed), so strong negative correlations are not reported.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix whose index and columns carry the same
        labels (for example the result of ``DataFrame.corr()``).
    threshold : float, default 0.95
        Coefficient above which the later column of a pair is reported.

    Returns
    -------
    list
        Column labels, one entry per offending pair, ordered by ``i`` and
        then ``j``. A label appears once for each earlier column it is
        correlated with, so the list may contain repeats.
    """
    columns = corr_matrix.columns

    # Align the rows to the column order so a position refers to the same
    # label on both axes, then work on the raw array instead of doing one
    # label-based lookup per pair.
    values = corr_matrix.loc[columns, columns].to_numpy()

    # Strict upper triangle: each unordered pair exactly once, no diagonal.
    upper_triangle = np.triu(np.ones(values.shape, dtype=bool), k=1)

    # NaN coefficients compare as False; silence the warning some NumPy
    # versions emit for such comparisons.
    with np.errstate(invalid="ignore"):
        above_threshold = upper_triangle & (values > threshold)

    # argwhere walks the mask row by row, which reproduces the (i, j)
    # ordering of a nested loop; only the j position is reported.
    later_positions = np.argwhere(above_threshold)[:, 1]
    return [columns[position] for position in later_positions]

In [None]:
# Collapse the per-pair list into the unique set of columns to remove, then
# show how many columns that is.
columns_to_drop = set(get_correlated_columns(corr_matrix))
len(columns_to_drop)

In [None]:
# Remove the highly correlated columns from both splits in place.
for frame in (X_train, X_test):
    frame.drop(columns=columns_to_drop, inplace=True)

In [None]:
print(X_train.shape)
print(X_test.shape)

### **ANOVA Test**

In [None]:
# Score every remaining feature against the activity labels with the ANOVA
# F-test and keep the 100 highest-scoring ones.
sel = SelectKBest(score_func=f_classif, k=100)
sel.fit(X_train, y_train)

# display selected feature names
X_train.columns[sel.get_support()]

In [None]:
# Remember the names of the selected features before the frames are
# replaced, then filter both splits and re-attach those names.
columns = X_train.columns[sel.get_support()]
X_train = pd.DataFrame(sel.transform(X_train), columns=columns)
X_test = pd.DataFrame(sel.transform(X_test), columns=columns)

In [None]:
X_train.head()

### After feature selection, the total number of features is reduced from 562 to 100.

#### Columns that are considered for model building

In [None]:
# Print each selected feature together with its zero-based position.
for index, column in enumerate(X_train.columns):
    print(index, ' - ', column)

In [None]:
# Keep an independent copy of the selected training features.
# NOTE(review): the previous expression referenced an undefined `data` and a
# single column name; this assumes the intent was a frame of the selected
# features -- confirm.
train_df = X_train.copy()

## **Building models** using above features.

In [None]:
# Encode the activity labels as integer class ids. The encoder learns its
# classes from the training labels only and is then reused for the test
# labels so both splits share one mapping.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

### Logistic Regression

In [None]:
# Initialize and train logistic regression model
log_reg = LogisticRegression(max_iter=1000)  # Increase max_iter if it doesn't converge
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Calculate and print accuracy score
accuracy = accuracy_score(y_test, y_pred)

print("Test accuracy:", accuracy)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Only the last expression of a notebook cell is rendered, so the raw matrix
# is printed explicitly; the labelled cross-tabulation (with row/column
# totals) is then shown as the cell's output.
print(confusion_matrix(y_test, y_pred))
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# Plot the confusion matrix of the test predictions as a colour-coded grid.
width = 6
height = 6
# Output classes to learn how to classify
LABELS = le.classes_
# Take the class count from the encoder so ticks and labels always agree.
n_classes = len(LABELS)
tick_marks = np.arange(n_classes)

plt.figure(figsize=(width, height))
plt.imshow(
    # Pin the label set so the matrix is always n_classes x n_classes, even
    # if a class is absent from both the true and the predicted labels.
    confusion_matrix(y_test, y_pred, labels=tick_marks),
    interpolation='nearest',
    cmap=plt.cm.rainbow
)
plt.title("Confusion matrix")
plt.colorbar()
plt.xticks(tick_marks, LABELS, rotation=90)
plt.yticks(tick_marks, LABELS)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Lay out the figure after the axis labels exist so they are accounted for.
plt.tight_layout()
plt.show()