# Loading common libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Regression metrics
from sklearn.metrics import mean_squared_error

# Classification metrics
from sklearn.metrics import accuracy_score

# Load datasets

## Load datasets for classification

In [3]:
# Download the pre-split classification CSVs from GitHub (train first, then test).
class_train_data, class_test_data = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/vamsivarma/datasets/master/machine_learning/logistic_regression/train.csv',
        'https://raw.githubusercontent.com/vamsivarma/datasets/master/machine_learning/logistic_regression/test.csv',
    )
)

# Quick sanity check: (rows, columns) of each split
print('Shape of classification training data :', class_train_data.shape)
print('Shape of classification testing data :', class_test_data.shape)

Shape of classification training data : (712, 25)
Shape of classification testing data : (179, 25)


Separating the target variable (or column) which we want to predict using ML algorithms

In [4]:
# Target variable for the classification task: 'Survived'.
# Both splits contain the target column, so each one is divided into
# features (x) and the known target (y); the test target is later used
# to score the predictions.

# separate the independent and target variables on the training data
class_train_x = class_train_data.drop(columns=['Survived'])
class_train_y = class_train_data['Survived']

# separate the independent and target variables on the testing data
class_test_x = class_test_data.drop(columns=['Survived'])
class_test_y = class_test_data['Survived']

# Keep the true targets as single-column frames ('Train' / 'Test') so they
# can be written to disk alongside the model predictions.
class_train = class_train_y.to_frame(name='Train')
class_test = class_test_y.to_frame(name='Test')

## Load datasets for regression

In [5]:
# Regression datasets (hosted under the linear_regression folder upstream).
# Download the pre-split CSVs from GitHub: train first, then test.
reg_train_data, reg_test_data = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/vamsivarma/datasets/master/machine_learning/linear_regression/train.csv',
        'https://raw.githubusercontent.com/vamsivarma/datasets/master/machine_learning/linear_regression/test.csv',
    )
)

# Quick sanity check: (rows, columns) of each split
print('\nShape of regression training data :', reg_train_data.shape)
print('\nShape of regression testing data :', reg_test_data.shape)


Shape of regression training data : (1364, 36)

Shape of regression testing data : (341, 36)


Separating the target variable (or column) which we want to predict using ML algorithms

In [6]:
# Target variable for the regression task: 'Item_Outlet_Sales'.
# Both splits contain the target column, so each one is divided into
# features (x) and the known target (y); the test target is later used
# to score the predictions.

# separate the independent and target variables on the training data
reg_train_x = reg_train_data.drop(columns=['Item_Outlet_Sales'])
reg_train_y = reg_train_data['Item_Outlet_Sales']

# separate the independent and target variables on the testing data
reg_test_x = reg_test_data.drop(columns=['Item_Outlet_Sales'])
reg_test_y = reg_test_data['Item_Outlet_Sales']

# Keep the true targets as single-column frames ('Train' / 'Test') so they
# can be written to disk alongside the model predictions.
reg_train = reg_train_y.to_frame(name='Train')
reg_test = reg_test_y.to_frame(name='Test')

In [7]:
# One column per algorithm: 'DT' (Decision Tree) and 'RF' (Random Forest).
col_list = ['DT', 'RF']

# Four independent, initially empty frames that collect the predictions:
# classification train/test followed by regression train/test.
class_train_pred, class_test_pred, reg_train_pred, reg_test_pred = (
    pd.DataFrame(columns=col_list) for _ in range(4)
)

# Decision Tree

In [8]:
# Exploratory: list every name exposed by the sklearn.tree module to see
# which tree estimators and helper functions are available.
import sklearn.tree
dir(sklearn.tree)

['BaseDecisionTree',
 'DecisionTreeClassifier',
 'DecisionTreeRegressor',
 'ExtraTreeClassifier',
 'ExtraTreeRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_classes',
 '_criterion',
 '_export',
 '_reingold_tilford',
 '_splitter',
 '_tree',
 '_utils',
 'export_graphviz',
 'export_text',
 'plot_tree']

### Classification

In [9]:
# importing required libraries
from sklearn.tree import DecisionTreeClassifier

# Create the object of the Decision Tree model.
# You can also add other parameters and test your code here;
# some parameters are: max_depth and max_features.
# Documentation of sklearn DecisionTreeClassifier:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#
# random_state is pinned so the fitted tree (and therefore the depth, the
# accuracies and the saved predictions below) is the same on every
# Restart & Run All.
model = DecisionTreeClassifier(random_state=42)

# fit the model with the training data
model.fit(class_train_x, class_train_y)

# depth of the fitted decision tree
print('Depth of the Decision Tree :', model.get_depth())

# predict the target on the train dataset
predict_train = model.predict(class_train_x)

# Save predictions
class_train_pred['DT'] = predict_train

# Accuracy Score on train dataset
accuracy_train = accuracy_score(class_train_y, predict_train)
print('\nAccuracy for Decision tree on train dataset : ', accuracy_train)

# predict the target on the test dataset
predict_test = model.predict(class_test_x)

# Save predictions
class_test_pred['DT'] = predict_test

# Accuracy Score on test dataset
accuracy_test = accuracy_score(class_test_y, predict_test)
print('\nAccuracy for Decision tree on test dataset : ', accuracy_test)


Depth of the Decision Tree : 19

Accuracy for Decision tree on train dataset :  0.9859550561797753

Accuracy for Decision tree on test dataset :  0.7821229050279329


### Regression

In [10]:
# importing required libraries
from sklearn.tree import DecisionTreeRegressor

# Create the object of the Decision Tree model.
# You can also add other parameters and test your code here;
# some parameters are: max_depth and max_features.
# Documentation of sklearn DecisionTreeRegressor:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
#
# random_state is pinned so the fitted tree (and therefore the depth, the
# RMSE values and the saved predictions below) is the same on every
# Restart & Run All.
model = DecisionTreeRegressor(random_state=42)

# fit the model with the training data (a single fit, so the depth reported
# below belongs to the same tree that produces the predictions)
model.fit(reg_train_x, reg_train_y)

# depth of the fitted decision tree
print('Depth of the Decision Tree :', model.get_depth())

# predict the target on the train dataset
predict_train = model.predict(reg_train_x)

# Save predictions
reg_train_pred['DT'] = predict_train

# Root Mean Squared Error on training dataset
# NOTE: no depth limit is set, so the tree can fit the training data very
# closely; judge the model by the test RMSE rather than this value.
rmse_train = mean_squared_error(reg_train_y, predict_train) ** 0.5
print('\nRMSE for Decision tree on train dataset : ', rmse_train)

# predict the target on the testing dataset
predict_test = model.predict(reg_test_x)

# Save predictions
reg_test_pred['DT'] = predict_test

# Root Mean Squared Error on testing dataset
rmse_test = mean_squared_error(reg_test_y, predict_test) ** 0.5
print('\nRMSE for Decision tree on test dataset : ', rmse_test)

Depth of the Decision Tree : 31

RMSE for Decision tree on train dataset :  0.0

RMSE for Decision tree on test dataset :  1399.8587741344759


# Random Forest

In [11]:
# Exploratory: list every name exposed by the sklearn.ensemble module to see
# which ensemble estimators (including the random forests) are available.
import sklearn.ensemble
dir(sklearn.ensemble)

['AdaBoostClassifier',
 'AdaBoostRegressor',
 'BaggingClassifier',
 'BaggingRegressor',
 'BaseEnsemble',
 'ExtraTreesClassifier',
 'ExtraTreesRegressor',
 'GradientBoostingClassifier',
 'GradientBoostingRegressor',
 'HistGradientBoostingClassifier',
 'HistGradientBoostingRegressor',
 'IsolationForest',
 'RandomForestClassifier',
 'RandomForestRegressor',
 'RandomTreesEmbedding',
 'StackingClassifier',
 'StackingRegressor',
 'VotingClassifier',
 'VotingRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_bagging',
 '_base',
 '_forest',
 '_gb',
 '_gb_losses',
 '_gradient_boosting',
 '_hist_gradient_boosting',
 '_iforest',
 '_stacking',
 '_voting',
 '_weight_boosting']

In [12]:
# Exploratory: print the built-in help (constructor signature and parameter
# documentation) for RandomForestClassifier.
help(sklearn.ensemble.RandomForestClassifier)

Help on class RandomForestClassifier in module sklearn.ensemble._forest:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------


### Classification

In [13]:
# importing required libraries
from sklearn.ensemble import RandomForestClassifier

# Create the object of the Random Forest model.
# You can also add other parameters and test your code here;
# some parameters are: n_estimators and max_depth.
# Documentation of sklearn RandomForestClassifier:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
# random_state is pinned because the forest draws bootstrap samples and
# random feature subsets; without it the accuracies and the saved
# predictions below change from run to run.
model = RandomForestClassifier(random_state=42)

# fit the model with the training data
model.fit(class_train_x, class_train_y)

# number of trees used
print('Number of Trees used : ', model.n_estimators)

# predict the target on the train dataset
predict_train = model.predict(class_train_x)

# Save predictions
class_train_pred['RF'] = predict_train

# Accuracy Score on train dataset
accuracy_train = accuracy_score(class_train_y, predict_train)
print('\nAccuracy for Random forest on train dataset : ', accuracy_train)

# predict the target on the test dataset
predict_test = model.predict(class_test_x)

# Save predictions
class_test_pred['RF'] = predict_test

# Accuracy Score on test dataset
accuracy_test = accuracy_score(class_test_y, predict_test)
print('\nAccuracy for Random forest on test dataset : ', accuracy_test)


Number of Trees used :  100

Accuracy for Random forest on train dataset :  0.9859550561797753

Accuracy for Random forest on test dataset :  0.7988826815642458


### Regression

In [14]:
# importing required libraries
from sklearn.ensemble import RandomForestRegressor

# Create the object of the Random Forest model.
# You can also add other parameters and test your code here;
# some parameters are: n_estimators and max_depth.
# Documentation of sklearn RandomForestRegressor:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
#
# random_state is pinned because the forest draws bootstrap samples and
# random feature subsets; without it the RMSE values and the saved
# predictions below change from run to run.
model = RandomForestRegressor(random_state=42)

# fit the model with the training data
model.fit(reg_train_x, reg_train_y)

# number of trees used
print('\nNumber of Trees used : ',model.n_estimators)

# predict the target on the train dataset
predict_train = model.predict(reg_train_x)

# Save predictions
reg_train_pred['RF'] = predict_train

# Root Mean Squared Error on training dataset
rmse_train = mean_squared_error(reg_train_y, predict_train) ** 0.5
print('\nRMSE for Random Forest on train dataset : ', rmse_train)

# predict the target on the testing dataset
predict_test = model.predict(reg_test_x)

# Save predictions
reg_test_pred['RF'] = predict_test

# Root Mean Squared Error on testing dataset
rmse_test = mean_squared_error(reg_test_y, predict_test) ** 0.5
print('\nRMSE for Random Forest on test dataset : ', rmse_test)


Number of Trees used :  100

RMSE for Random Forest on train dataset :  436.5000643692796

RMSE for Random Forest on test dataset :  1057.792299157838


In [15]:
# Preview the first rows of the classification predictions on the train set
# (one column per model: DT and RF).
class_train_pred.head()

Unnamed: 0,DT,RF
0,0,0
1,1,1
2,1,1
3,0,0
4,0,0


In [16]:
# Preview the first rows of the classification predictions on the test set
# (one column per model: DT and RF).
class_test_pred.head()

Unnamed: 0,DT,RF
0,0,0
1,0,0
2,0,0
3,1,1
4,1,1


In [17]:
# Preview the first rows of the regression predictions on the train set
# (one column per model: DT and RF).
reg_train_pred.head()

Unnamed: 0,DT,RF
0,291.6204,460.247566
1,2163.1842,2001.49467
2,2387.5588,2549.98071
3,161.1236,190.778332
4,1981.4208,2215.908902


In [18]:
# Preview the first rows of the regression predictions on the test set
# (one column per model: DT and RF).
reg_test_pred.head()

Unnamed: 0,DT,RF
0,1272.3438,1466.717452
1,2306.997,2842.10046
2,2631.2416,2219.870412
3,729.051,2680.237822
4,3149.234,2608.897352


In [19]:
# Write the true target values to disk, one file per split.
# The files carry a '.csv' extension but are tab-delimited, with no index column.
for out_name, truth_frame in (
    ('class_train.csv', class_train),
    ('class_test.csv', class_test),
    ('reg_train.csv', reg_train),
    ('reg_test.csv', reg_test),
):
    truth_frame.to_csv(out_name, sep='\t', index=False)

In [20]:
# Write the model predictions (DT and RF columns) to disk, one file per split.
# The files carry a '.csv' extension but are tab-delimited, with no index column.
for out_name, pred_frame in (
    ('class_train_pred.csv', class_train_pred),
    ('class_test_pred.csv', class_test_pred),
    ('reg_train_pred.csv', reg_train_pred),
    ('reg_test_pred.csv', reg_test_pred),
):
    pred_frame.to_csv(out_name, sep='\t', index=False)