In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Choosing appropriate data : Regression 

In [3]:
#Fetch california data from sklearn
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [20]:
#converting dictinonary into a dataframe
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [21]:
len(housing_df)

20640

In [22]:
housing_df["Target"] = housing["target"]
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [23]:
#X y Split
np.random.seed(42)
X = housing_df.drop("Target",axis = 1)
y = housing_df["Target"]

#split the data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [12]:
from sklearn.linear_model import Ridge
model = Ridge()
model.fit(X_train,y_train)

In [13]:
model.score(X_test,y_test)

0.5760100904666183

In [14]:
from sklearn.linear_model import Lasso
model = Lasso()
model.fit(X_train,y_train)

In [15]:
model.score(X_test,y_test)

0.28769418055406437

In [16]:
from sklearn.svm import SVR
regr = SVR()
regr.fit(X_train,y_train)

In [17]:
regr.score(X_test,y_test)

-0.026351123981301017

In [24]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train,y_train)

In [25]:
model.score(X_test, y_test)

0.8065734772187598

## Choosing appropriate data : Classification 

### Iris dataset

In [26]:
from sklearn.datasets import load_iris
iris = load_iris()
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [27]:
#conveting into dataframe
iris_df = pd.DataFrame(iris["data"],columns=iris["feature_names"])
iris_df["Target"] = iris["target"]
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [28]:
len(iris_df)

150

In [29]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Target             150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [66]:
iris_df["sepal length (cm)"].value_counts()

sepal length (cm)
5.0    10
5.1     9
6.3     9
5.7     8
6.7     8
5.8     7
5.5     7
6.4     7
4.9     6
5.4     6
6.1     6
6.0     6
5.6     6
4.8     5
6.5     5
6.2     4
7.7     4
6.9     4
4.6     4
5.2     4
5.9     3
4.4     3
7.2     3
6.8     3
6.6     2
4.7     2
7.6     1
7.4     1
7.3     1
7.0     1
7.1     1
5.3     1
4.3     1
4.5     1
7.9     1
Name: count, dtype: int64

In [44]:
max(iris_df["sepal length (cm)"]) , min(iris_df["sepal length (cm)"])

(7.9, 4.3)

In [45]:
X = iris_df.drop("Target", axis = 1)
y = iris_df["Target"]

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2)

In [48]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train,y_train)


In [49]:
model.score(X_test,y_test)

1.0

In [50]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train,y_train)

In [51]:
model.score(X_test,y_test)

1.0

### Heart Disease Dataset

In [71]:
len(hd)

303

In [72]:
from sklearn.svm import LinearSVC

np.random.seed(42)

hd = pd.read_csv("E:\Datasets\data\heart-disease.csv")

X = hd.drop("target", axis = 1)
y = hd["target"]
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2)

clf = LinearSVC()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

  hd = pd.read_csv("E:\Datasets\data\heart-disease.csv")


0.8688524590163934

In [67]:
hd["target"].value_counts()

target
1    165
0    138
Name: count, dtype: int64

In [73]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

hd = pd.read_csv("E:\Datasets\data\heart-disease.csv")

X = hd.drop("target", axis = 1)
y = hd["target"]
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

  hd = pd.read_csv("E:\Datasets\data\heart-disease.csv")


0.8524590163934426