# Importing Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.manifold import TSNE
from collections import Counter
from sklearn.metrics import accuracy_score , confusion_matrix ,classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC ,SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Loading Data

In [None]:
# Read the feature names: one entry per line of features.txt, with the
# trailing newline removed.
features = []
with open("UCI_HAR_Dataset/features.txt") as f:
    features = [line.partition("\n")[0] for line in f]
print(len(features))
features[:5]

In [None]:
# Number of distinct feature names; compare with len(features) printed in the
# previous cell to see whether any names repeat.
len(set(features))

In [None]:
# Training features: whitespace-delimited matrix with one column per entry in
# `features`. `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`.
Xtrain = pd.read_csv(
    "UCI_HAR_Dataset/train/X_train.txt",
    sep=r"\s+",
    header=None,
    names=features,
)

# The `squeeze=` keyword was removed from read_csv in pandas 2.0, so the
# single-column frames are squeezed to Series explicitly.
Xtrain["subject"] = pd.read_csv(
    "UCI_HAR_Dataset/train/subject_train.txt",
    header=None,
).squeeze("columns")

# NOTE(review): the stock UCI archive ships this file as `y_train.txt`
# (lowercase y) - confirm the casing of the local copy on case-sensitive
# filesystems.
Ytrain = pd.read_csv(
    "UCI_HAR_Dataset/train/Y_train.txt",
    names=["Activity"],
    header=None,
).squeeze("columns")

# Human-readable activity name for every numeric activity code.
Ytrain_labels = Ytrain.map({
    1: 'WALKING',
    2: 'WALKING_UPSTAIRS',
    3: 'WALKING_DOWNSTAIRS',
    4: 'SITTING',
    5: 'STANDING',
    6: 'LAYING',
})

# `Train` is the same object as `Xtrain` (not a copy), so the label columns
# added here also appear on `Xtrain`.
Train = Xtrain
Train["Activity"] = Ytrain
Train["Activity_name"] = Ytrain_labels

Train.sample()

In [None]:
# (rows, columns) of the training frame, including the subject/label columns added above.
Train.shape

In [None]:
# Test features: whitespace-delimited matrix with one column per entry in
# `features`. `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`.
Xtest = pd.read_csv(
    "UCI_HAR_Dataset/test/X_test.txt",
    sep=r"\s+",
    header=None,
    names=features,
)

# The `squeeze=` keyword was removed from read_csv in pandas 2.0, so the
# single-column frames are squeezed to Series explicitly.
Xtest["subject"] = pd.read_csv(
    "UCI_HAR_Dataset/test/subject_test.txt",
    header=None,
).squeeze("columns")

# NOTE(review): the stock UCI archive ships this file as `y_test.txt`
# (lowercase y) - confirm the casing of the local copy on case-sensitive
# filesystems.
Ytest = pd.read_csv(
    "UCI_HAR_Dataset/test/Y_test.txt",
    names=["Activity"],
    header=None,
).squeeze("columns")

# Human-readable activity name for every numeric activity code.
Ytest_labels = Ytest.map({
    1: 'WALKING',
    2: 'WALKING_UPSTAIRS',
    3: 'WALKING_DOWNSTAIRS',
    4: 'SITTING',
    5: 'STANDING',
    6: 'LAYING',
})

# `Test` is the same object as `Xtest` (not a copy), so the label columns
# added here also appear on `Xtest`.
Test = Xtest
Test["Activity"] = Ytest
Test["Activity_name"] = Ytest_labels

Test.sample()

In [None]:
# (rows, columns) of the test frame, including the subject/label columns added above.
Test.shape

# Data Preprocessing

In [None]:
# Count fully duplicated rows in each split.
print("No. of duplicated values in Train :", Train.duplicated().sum())
print("No. of duplicated values in Test :", Test.duplicated().sum())

In [None]:
# Total number of missing cells (NaN / null) across every column of each split.
print("Np. of Nan or null values in Train :", Train.isna().to_numpy().sum())
print("No. of Nan or null values in Test :", Test.isna().to_numpy().sum())

# Checking for data imbalance

In [None]:
# Bar chart of sample counts per subject, split by activity name.
sns.set_theme(style="darkgrid")
plt.figure(figsize=(16, 8))
sns.countplot(data=Train, x="subject", hue="Activity_name")
plt.title("Data provided by each user", fontsize=20)
plt.show()

In [None]:
# Bar chart of sample counts per activity; tick labels are rotated so the
# activity names stay readable.
plt.figure(figsize=(10, 5))
sns.countplot(x=Train["Activity_name"])
plt.title("No. of data per activity", fontsize=20)
plt.xticks(rotation=90)
plt.show()

# Changing column names

In [None]:
columns = Train.columns

# Strip '(', ')', '-' and ',' from every column name in a single pass.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, so a bare '[()]' pattern would match nothing.
columns = columns.str.replace(r"[()\-,]", "", regex=True)

# Apply the cleaned names to both frames (they were built with the same
# column order, so one Index serves both).
Train.columns = columns
Test.columns = columns

Test.columns

In [None]:
# List every cleaned column name that contains "tBodyAccMagmean".
for col in columns:
    if "tBodyAccMagmean" in col:
        print(col)

# EDA

In [None]:
# Per-activity density of the '201 tBodyAccMagmean' feature.
# `sns.distplot` is deprecated; with `hist=False` it only drew a KDE curve,
# so `sns.kdeplot` is the direct replacement.
(
    sns.FacetGrid(Train, hue="Activity_name", height=6, aspect=2)
    .map(sns.kdeplot, '201 tBodyAccMagmean')
    .add_legend()
)

# Label the cluster of curves near -0.9 on the x-axis.
plt.annotate(
    "Stationary Activities",
    xy=(-0.9, 15),
    xytext=(-0.8, 15),
    va="center",
    ha="left",
    size=20,
    arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad= 0.1"),
)

# Label the curves spread around 0 on the x-axis.
plt.annotate(
    "Moving Activities",
    xy=(0, 3),
    xytext=(0.3, 6),
    va="center",
    ha="left",
    size=20,
    arrowprops=dict(arrowstyle="simple", connectionstyle="arc3,rad = 0.1"),
)
plt.show()

# TSNE

In [None]:
# t-SNE inputs: every feature column (subject and label columns dropped) plus
# the activity names used to colour the points.
x_tsne = Train.drop(columns=["subject", "Activity", "Activity_name"])
y_tsne = Train["Activity_name"]

print(x_tsne.shape, y_tsne.shape)

In [None]:

def perform_tsne(xdata , ydata , perplexity , n_iter = 2000):
    """Embed `xdata` in 2-D with t-SNE once per perplexity value and plot it.

    For each perplexity the embedding is drawn as a scatter plot coloured by
    `ydata`, saved to a file named "TSNE with perplexity<value>" and shown.

    Parameters
    ----------
    xdata : feature matrix handed to ``TSNE.fit_transform``.
    ydata : one label per row of `xdata`, used as the plot hue.
    perplexity : iterable of perplexity values to try.
    n_iter : number of optimisation iterations for each t-SNE run.
    """
    import inspect

    # `n_iter` used to be ignored entirely. Forward it to TSNE under whichever
    # keyword the installed scikit-learn accepts (`n_iter` was renamed to
    # `max_iter` in newer releases).
    tsne_params = inspect.signature(TSNE).parameters
    iter_kwarg = "max_iter" if "max_iter" in tsne_params else "n_iter"

    for i in perplexity:
        print("Performing TSNE with perplexity",i)
        tsne = TSNE(perplexity = i, **{iter_kwarg: n_iter}).fit_transform(xdata)

        print("Plotting visualisation of TSNE")
        dataset = pd.DataFrame(tsne,columns = ["x","y"])
        # Assign labels by position: a Series would be aligned on its index,
        # which yields NaN labels if `ydata` does not carry a 0..n-1 index.
        dataset["Activity"] = np.asarray(ydata)
        sns.FacetGrid(dataset,hue = "Activity",height = 7).map(plt.scatter,"x","y").add_legend()
        plt.savefig("TSNE with perplexity" + str(i))
        plt.show()
 
# Run the t-SNE visualisation for four perplexity values, requesting 3000 iterations.
perform_tsne(x_tsne, y_tsne, perplexity=[20, 30, 40, 50], n_iter=3000)

# ML Model

In [None]:
# Model inputs: feature columns only; the numeric activity code is the target.
Xtrain = Train.drop(["subject","Activity","Activity_name"],axis = 1)
Ytrain = Train.Activity

Xtest = Test.drop(["subject","Activity","Activity_name"] , axis = 1)
Ytest = Test.Activity

Activity_labels = Ytrain_labels

Scaler = StandardScaler()

# Fit the scaler on the training data only and reuse those statistics for the
# test set. Re-fitting on the test set (the previous `fit_transform`) scales
# the two splits differently and leaks test-set statistics into evaluation.
Xtrain = Scaler.fit_transform(Xtrain)
Xtest = Scaler.transform(Xtest)

print(Xtrain.shape , Ytrain.shape)
print(Xtest.shape , Ytest.shape)

In [None]:
def model_run(model,xtrain,ytrain,xtest,ytest):
    """Fit a search object, predict on the test split and print a report.

    `model` must expose `best_estimator_`, `best_params_` and `best_score_`
    after fitting (every caller in this file passes a GridSearchCV).
    The report contains those three attributes followed by the test accuracy,
    the confusion matrix and the classification report. Nothing is returned.
    """
    gap = "\n\n\n"

    model = model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)

    # Search results: heading text paired with the fitted attribute to show.
    search_sections = (
        ("Best Estimator\n", "best_estimator_"),
        ("Best Parameter\n", "best_params_"),
        ("Best Score\n", "best_score_"),
    )
    for heading, attribute in search_sections:
        print(gap)
        print(heading)
        print(getattr(model, attribute))

    # Test-set metrics.
    print(gap)
    print("Accuracy score", accuracy_score(ytest, ypred))
    print(gap)
    print("Confusion matrix\n", confusion_matrix(ytest, ypred))
    print(gap)
    print("Classfication report\n")
    print(classification_report(ytest, ypred))
    print(gap)
    
 

# Logistic Regression

In [None]:
model = LogisticRegression(max_iter = 1000)

# The default solver (lbfgs) supports only the l2 penalty; pairing it with l1
# makes every such fit fail, and with warnings suppressed those candidates are
# silently scored as NaN. Search l2 with the default solver and l1 with
# `saga`, which supports it.
params = [
    {"C": [0.01, 0.1, 1, 10, 20, 30], "penalty": ["l2"]},
    {"C": [0.01, 0.1, 1, 10, 20, 30], "penalty": ["l1"], "solver": ["saga"]},
]

grid_search = GridSearchCV(model,param_grid = params , cv =3,verbose = 1 , n_jobs = - 1)

model_run(grid_search,Xtrain,Ytrain,Xtest,Ytest)

# Linear SVM Classifier

In [None]:
# `dual=False` is set explicitly: the l1 penalty is not supported by the dual
# formulation, so with `dual=True` every l1 candidate in the grid fails to
# fit. The primal formulation accepts both l1 and l2 with the default
# squared-hinge loss.
model = LinearSVC(max_iter = 2000,tol= 0.00005, dual = False)
params = {"C": [0.01,0.1,1,10,20,30],"penalty":["l1","l2"]}

grid_search = GridSearchCV(model,param_grid = params ,cv =3 ,verbose = 1 , n_jobs = -1)

model_run(grid_search , Xtrain ,Ytrain, Xtest , Ytest)

# Kernel SVM Classifier

In [None]:
# RBF-kernel SVM: coarse 3-fold grid search over C and gamma.
model = SVC(kernel="rbf")
params = dict(C=[0.1, 1, 2, 4], gamma=[0.0078, 0.01, 1, 1.125, 2])

grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
model_run(grid_search, Xtrain, Ytrain, Xtest, Ytest)

In [None]:
# Second RBF-kernel SVM search over a narrower C range and smaller gamma
# values, to see whether the previous result can be improved.
model = SVC(kernel="rbf")
params = dict(C=[1.8, 2, 2.2], gamma=[0.001, 0.003, 0.006])

grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
model_run(grid_search, Xtrain, Ytrain, Xtest, Ytest)

# Decision Tree Classifier

In [None]:
# Decision tree: grid search over max_depth 3..7 with GridSearchCV's default
# cross-validation settings.
model = DecisionTreeClassifier()
params = dict(max_depth=np.arange(3, 8))

grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    n_jobs=-1,
)
model_run(grid_search, Xtrain, Ytrain, Xtest, Ytest)

# Random Forest Classifier

In [None]:
# Random forest: grid search over tree count (100..280, step 20) and depth
# (3..13, step 2). Parallelism is set on the forest itself (n_jobs=-1); the
# grid search is created without an n_jobs argument.
model = RandomForestClassifier(n_jobs=-1)
params = dict(
    n_estimators=np.arange(100, 300, 20),
    max_depth=np.arange(3, 15, 2),
)

grid_search = GridSearchCV(estimator=model, param_grid=params)
model_run(grid_search, Xtrain, Ytrain, Xtest, Ytest)

# Gradient Boosting Classifier

In [None]:
# Gradient boosting: grid search over the number of boosting stages
# (130..160, step 10) and tree depth (5..7).
model = GradientBoostingClassifier()
params = dict(
    n_estimators=np.arange(130, 170, 10),
    max_depth=np.arange(5, 8),
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    n_jobs=-1,
)
model_run(grid_search, Xtrain, Ytrain, Xtest, Ytest)

# Conclusion

Logistic Regression, Linear SVC and Kernel SVC will be the most accurate models.