# Import libraries

In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns


In [2]:

# Print the full path of every file found under the Kaggle input directory.
import os

input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)


# Load the training dataset

In [3]:
df_train=pd.read_csv('/kaggle/input/human-activity-recognition-with-smartphones/train.csv')

In [4]:
df_train.head()

In [5]:
df_train.tail()

# Exploratory Data Analysis (EDA)

In [6]:
df_train.shape

In [7]:
df_train.isnull().sum()

In [8]:
df_train.describe()

In [9]:
#checking for duplicates

print('Number of duplicate entries in the dataset {}'.format(sum(df_train.duplicated())))

# Class Distribution

In [10]:
# Distinct activity labels present in the training data.
df_train.Activity.unique()

# Visualize the class distribution

In [11]:
# Plot the number of observations per subject, split by activity.
sns.set_style('whitegrid')
# Explicit figure/axes so the title and the seaborn plot target the same axes.
fig, ax = plt.subplots(figsize=(20,10))
ax.set_title('Observations per User', fontsize=20)
sns.countplot(x='subject', hue='Activity', data=df_train, ax=ax)
# plt.show() renders the figure; the previous plt.plot() drew nothing and
# only echoed an empty list as the cell output.
plt.show()

In [12]:
# Plot the number of observations recorded for each activity.
fig, axis = plt.subplots(figsize=(12,6))
sns.countplot(x="Activity", data=df_train, ax=axis)
axis.set_title('Observations per Activity')
# Rotate the category labels so the long activity names do not overlap.
# (The previous plt.xticks(x=...) call passed the whole column as a text
# property; only the rotation was meaningful.)
axis.tick_params(axis='x', labelrotation=90)
plt.show()

# Subjects, Features (X) and Labels (y)

In [13]:
df_train['subject'].unique()

In [14]:
X=pd.DataFrame(df_train.drop(['Activity','subject'],axis=1))
y=df_train.Activity.values.astype(object)

In [15]:
X.shape , y.shape

In [16]:
X.head()

In [17]:
y[5]

In [18]:
X.info()

In [19]:
# Number of numeric features in the training set.
# select_dtypes is the public pandas API; the previous _get_numeric_data()
# is a private helper that may change without notice. Note that, unlike the
# private helper, include='number' does not count boolean columns.
num_cols = X.select_dtypes(include='number').columns
print("Number of numeric features:",num_cols.size)

# Transforming non-numerical labels into numerical labels

In [20]:
from sklearn import preprocessing

In [21]:
encoder=preprocessing.LabelEncoder()

In [22]:
encoder.fit(y)
y=encoder.transform(y)
y.shape

In [23]:
y[5]

In [24]:
encoder.classes_

In [25]:
encoder.classes_[5]

# Feature Scaling

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
scaler=StandardScaler()

In [28]:
X=scaler.fit_transform(X)

In [29]:
X[5]

# Now Split X and y 
# Training and Validation sets

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=100)

In [32]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Now Train The Model

1. SVM
2. Random Forest
3. Evaluation metrics (confusion matrix, accuracy, recall, precision, F1)
4. Decision Tree



In [33]:

# import SVC classifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# import metrics to compute accuracy (evaluation)
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

# Firstly SVM

# Run SVM with default hyperparameters

In [34]:
# Support-vector classifier created with no arguments, i.e. with the
# library's default hyperparameters.
svc=SVC()

In [35]:
# Train on the training split.
svc.fit(X_train,y_train)

In [36]:
# Predictions of the default SVC on the held-out split; later cells read
# them through `y_pred`.
y_pred=svc.predict(X_test)

# Default SVC Score

In [37]:
# Accuracy of the predictions currently stored in `y_pred` on the held-out split.
default_svc_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score with default hyperparameters: {0:0.4f}'.format(default_svc_accuracy))

# SVM with explicit hyperparameters: RBF kernel and C=100.0

In [38]:
# RBF-kernel SVC with C=100.0, trained on the training split
# (fit returns the estimator itself, so the calls can be chained).
svc2 = SVC(C=100.0, kernel='rbf').fit(X_train, y_train)

# Predict the held-out split and measure accuracy.
y_pred2 = svc2.predict(X_test)
rbf_c100_accuracy = accuracy_score(y_test, y_pred2)

print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'.format(rbf_c100_accuracy))

# Now Random Forest Classifier

In [39]:
rand_clf=RandomForestClassifier(random_state=5)

In [40]:
rand_clf.fit(X_train,y_train)

In [41]:
# compute and print accuracy score
rand_clf.score(X_test,y_test)

In [42]:
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [43]:
# Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [44]:
# Recall
from sklearn.metrics import recall_score
recall_score(y_test, y_pred, average=None)

In [45]:
# Precision
from sklearn.metrics import precision_score
precision_score(y_test, y_pred, average=None)

In [46]:
#calculate F1 Score
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average=None)

In [47]:
#DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)

In [48]:
# Predicting the values of test data
y_pred = dtree.predict(X_test)
print("Classification report - \n", classification_report(y_test,y_pred))

# Hyperparameters of Random Forest Classifier:

# These are the parameters

1. max_depth: The max_depth of a tree in Random Forest is defined as the longest path between the root node and the leaf node.
2. min_samples_split: The minimum number of observations a node must contain before the decision trees in the random forest are allowed to split it. Default = 2
3. max_leaf_nodes: This hyperparameter sets a condition on the splitting of the nodes in the tree and hence restricts the growth of the tree.
4. min_samples_leaf: This Random Forest hyperparameter specifies the minimum number of samples that should be present in the leaf node after splitting a node. Default = 1
5. n_estimators: Number of trees in the forest.
6. max_samples: When bootstrap sampling is enabled, the number (or fraction) of samples drawn from the original dataset to train each individual tree.
7. max_features: The maximum number of features considered when looking for the best split at each node of a tree.
8. bootstrap: Method for sampling data points (with or without replacement). Default = True
9. criterion: The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

# Now, manually setting the hyperparameters, and using GridSearchCV for Hyperparameter Tuning:


In [49]:
'''
grid_param={
    'n_estimators':[90,100,115,130],
    'criterion':['gini','entropy'],
    'max_depth':range(2,20,1),
    'min_samples_leaf':range(1,10,1),
    'min_samples_split':range(2,10,1),
    'max_features':['auto','log2']
}
'''

In [50]:
#grid_search=GridSearchCV(estimator=rand_clf,param_grid=grid_param,cv=5,n_jobs=-1,verbose=3)

In [51]:
#grid_search.fit(X_train,y_train)

In [52]:
#grid_search.best_params_

In [53]:
#rand_clf=RandomForestClassifier()