In [None]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the task CSV and keep every column except the first one
# (all rows, columns from position 1 onward).
data = pd.read_csv('/task_b.csv').iloc[:, 1:]

In [None]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [None]:
# Correlation of every column with the label column 'y'.
data.corr()['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [None]:
# Per-column standard deviation, to compare the scale of the features.
data.std()

f1      488.195035
f2    10403.417325
f3        2.926662
y         0.501255
dtype: float64

In [None]:
# Split the table into the feature matrix X (columns f1, f2, f3) and the
# label vector Y (column y), both as NumPy arrays, then show their shapes.
X, Y = data[['f1', 'f2', 'f3']].values, data['y'].values
print(X.shape, Y.shape, sep='\n')

(200, 3)
(200,)


# What if our features have different variances?

<pre>
* <b>As part of this task you will observe how linear models work in case of data having features with different variance</b>
* <b>From the output of the above cells you can observe that var(f2)>>var(f1)>>var(f3)</b>

> <b>Task1</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' and check the feature importance

> <b>Task2</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance

</pre>

<h3><font color='blue'> Make sure you write the observations for each task, why a particular feature got more importance than others</font></h3>

## 1. Task-1

a) Logistic Regression (SGDClassifier with log loss)

In [None]:
# Task 1(a): SGD-trained classifier with log loss, fit on the raw features.
# fit() returns the estimator itself, so construction and training are chained.
clf_1 = SGDClassifier(loss='log').fit(X, Y)
# Accuracy of the model, measured on the same data it was trained on.
print("Accuracy: ", clf_1.score(X, Y))

# Weight vector of the single decision function; the magnitude of each
# weight is reported as that feature's importance.
feature_importance = clf_1.coef_[0]
print('Feature Importance Coefficients')
for position, weight in enumerate(feature_importance, start=1):
  print("f{}= {}".format(position, abs(weight)))



Accuracy:  0.47
Feature Importance Coefficients
f1= 2888.3605521018426
f2= 1900.0711926308315
f3= 10267.57846259901




b) SVM (SGDClassifier with Hinge Loss)


In [None]:
# Task 1(b): SGD-trained classifier with hinge loss, fit on the raw features.
# fit() returns the estimator itself, so construction and training are chained.
clf_2 = SGDClassifier(loss='hinge').fit(X, Y)
# Accuracy of the model, measured on the same data it was trained on.
print("Accuracy: ", clf_2.score(X, Y))

# Weight vector of the single decision function; the magnitude of each
# weight is reported as that feature's importance.
feature_importance = clf_2.coef_[0]
print('Feature Importance Coefficients')
for position, weight in enumerate(feature_importance, start=1):
  print("f{}= {}".format(position, abs(weight)))

Accuracy:  0.47
Feature Importance Coefficients
f1= 8898.536041326339
f2= 8425.3558028445
f3= 10234.290913221057


##OBSERVATIONS

* As we look at the Correlation of the features with 'y', we see that f3 has the highest correlation, followed by f1 and then f2.
* Also, f3 has the least variance, followed by f1 and then f2.
* Feature importance of f3 is the highest, followed by f1 and then f2 in both Logistic Regression and SVM.

Hence we may conclude that those features having the highest correlation with the class label and also the least variance will be assigned the highest weight (feature importance).

## 2. Task-2

a) Logistic Regression (SGDClassifier with Log Loss) after Column Standardization


In [None]:
# Task 2(a): SGD-trained classifier with log loss on column-standardized features.
#
# fit_transform() returns the standardized array as a NEW object and leaves X
# untouched, so its result has to be captured. Previously the return value was
# discarded and the model was trained and scored on the raw X, which made this
# cell a repeat of Task 1(a).
std_scaler = StandardScaler()
X_std = std_scaler.fit_transform(X)  # each column: (column - mean) / std

clf_4 = SGDClassifier(loss='log')
clf_4.fit(X_std, Y)
# Accuracy on the training data, scored on the same standardized features.
print("Accuracy: ", clf_4.score(X_std, Y))

# Weight vector of the single decision function; the magnitude of each
# weight is reported as that feature's importance.
feature_importance = clf_4.coef_[0]
print('Feature Importance Coefficients')
for index, weight in enumerate(feature_importance, start=1):
  print("f{}= {}".format(index, abs(weight)))


Accuracy:  0.47
Feature Importance Coefficients
f1= 6573.532837143187
f2= 12847.575225646253
f3= 10608.854530679522


b) SVM (SGDClassifier with hinge loss) after Column Standardization




In [None]:
# Task 2(b): SGD-trained classifier with hinge loss on column-standardized features.
#
# fit_transform() returns the standardized array as a NEW object and leaves X
# untouched, so its result has to be captured. Previously the return value was
# discarded and the model was trained and scored on the raw X, which made this
# cell a repeat of Task 1(b).
std_scaler = StandardScaler()
X_std = std_scaler.fit_transform(X)  # each column: (column - mean) / std

clf_5 = SGDClassifier(loss='hinge')
clf_5.fit(X_std, Y)
# Accuracy on the training data, scored on the same standardized features.
print("Accuracy: ", clf_5.score(X_std, Y))

# Weight vector of the single decision function; the magnitude of each
# weight is reported as that feature's importance.
feature_importance = clf_5.coef_[0]
print('Feature Importance Coefficients')
for index, weight in enumerate(feature_importance, start=1):
  print("f{}= {}".format(index, abs(weight)))


Accuracy:  0.525
Feature Importance Coefficients
f1= 578.1671125044395
f2= 4584.309589898638
f3= 9670.550611323984


### OBSERVATION

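* Here column standardization has been applied to all 3 features, so every feature now has zero mean and unit variance (StandardScaler does not rescale features to the range 0 to 1; that is what MinMaxScaler does). Standardization does not change a feature's correlation with the class label 'y'; it only puts all the features on a common scale, so that the learned weights can be compared across features.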

* Here we observe that f2 has the highest feature importance followed by f3 and then f1 in Logistic Regression.

* In SVM, we find that f3 has the highest feature importance, followed by f2 and then f1. Also the accuracy of this model is the highest among all.

Hence the overall conclusion is that we have to scale the features using StandardScaler so that the feature weights are not driven by the differing scales (variances) of the features, but by how informative each feature is about the class label.