In [1]:
# importing pandas for reading datasets
import pandas as pd

#for dividing whole training dataset into train and test
from sklearn.model_selection import train_test_split

#for accuracy checking
from sklearn.metrics import accuracy_score

In [2]:
# Reading the train and test dataset
train_data = pd.read_csv("forest-cover-type-prediction/train.csv")
test_data = pd.read_csv("forest-cover-type-prediction/test.csv")
submission = pd.read_csv("forest-cover-type-prediction/sampleSubmission.csv")

print (train_data.head())
print (test_data.head())




   Id  Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0   1       2596      51      3                               258   
1   2       2590      56      2                               212   
2   3       2804     139      9                               268   
3   4       2785     155     18                               242   
4   5       2595      45      2                               153   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                               0                              510   
1                              -6                              390   
2                              65                             3180   
3                             118                             3090   
4                              -1                              391   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm     ...      Soil_Type32  \
0            221             232            148     ...                0   
1           

In [3]:
# if standard deviation of any column is less than 0.1 that column is removed from both train and test dataset
for i in train_data.columns:
    if(train_data[i].std() < 0.1):
        train_data.drop([i],axis=1,inplace=True)  
        test_data.drop([i],axis=1,inplace=True)
      

In [4]:
# independent and target variable is seperated
train_x = train_data.drop(columns=['Cover_Type'],axis=1)
train_y = train_data['Cover_Type']


#if we are predicting for unseed testing data
#test_x = test_data
#test_y = test_data['Cover_Type']

#if we are predicting on the test data taken from the training set
train_x, test_x, train_y, test_y = train_test_split(train_data,train_y , test_size=0.3, random_state=100)

In [5]:

#################  LOGISTIC REGRESSION ##############
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression(random_state=45)

##################### RANDOM FOREST #################
from sklearn.ensemble import RandomForestClassifier

RF_model = RandomForestClassifier(n_estimators=100, random_state=45)

##################### BAGGING #################
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

base_estimator = DecisionTreeClassifier(random_state=45,max_depth=20)
BAG_model = BaggingClassifier(base_estimator=base_estimator, n_estimators=100, random_state=45)

################   XG BOOST #####################
from xgboost import XGBClassifier

XG_model = XGBClassifier(subsample=0.20,n_estimators=100, random_state=45)


In [6]:
#LOGISTIC REGRESSION

# fit the model with the training data
LR_model.fit(train_x,train_y)

# predict the target on the train dataset
predict_train = LR_model.predict(train_x)
print('\nTarget on train data',predict_train)

# Accuray Score on train dataset
accuracy_train = accuracy_score(train_y,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

# predict the target on the test dataset
predict_test = LR_model.predict(test_x)
print('\nPrediction on test data',predict_test) 

# Accuray Score on test dataset
accuracy_test = accuracy_score(test_y,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)




Target on train data [3 1 3 ... 3 5 4]

Accuracy on train dataset :  0.9429327286470144

Prediction on test data [7 5 1 ... 7 5 1]

Accuracy on test dataset :  0.9329805996472663


In [7]:
#RANDOM FOREST

# fit the model with the training data
RF_model.fit(train_x,train_y)

# predict the target on the train dataset
predict_train = RF_model.predict(train_x)
print('\nTarget on train data',predict_train)

# Accuray Score on train dataset
accuracy_train = accuracy_score(train_y,predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)

# predict the target on the test dataset
predict_test = RF_model.predict(test_x)
print('\nTarget on test data',predict_test) 

# Accuray Score on test dataset
accuracy_test = accuracy_score(test_y,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)


Target on train data [3 1 3 ... 3 5 4]

accuracy_score on train dataset :  1.0

Target on test data [7 5 1 ... 7 5 1]

Accuracy on test dataset :  0.9986772486772487


In [8]:
#BAGGING

# fit the model with the training data
BAG_model.fit(train_x,train_y)

# predict the target on the train dataset
predict_train = BAG_model.predict(train_x)
print('\nTarget on train data',predict_train)

# Accuray Score on train dataset
accuracy_train = accuracy_score(train_y,predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)

# predict the target on the test dataset
predict_test = BAG_model.predict(test_x)
print('\nTarget on test data',predict_test) 

# Accuray Score on test dataset
accuracy_test = accuracy_score(test_y,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)


Target on train data [3 1 3 ... 3 5 4]

accuracy_score on train dataset :  1.0

Target on test data [7 5 1 ... 7 5 1]

Accuracy on test dataset :  1.0


In [9]:
#XGBOOST

# fit the model with the training data
XG_model.fit(train_x,train_y)

# predict the target on the train dataset
predict_train = XG_model.predict(train_x)
print('\nTarget on train data',predict_train) 

# Accuray Score on train dataset
accuracy_train = accuracy_score(train_y,predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)

# predict the target on the test dataset
predict_test = XG_model.predict(test_x)
print('\nTarget on test data',predict_test) 

# Accuray Score on test dataset
accuracy_test = accuracy_score(test_y,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)


Target on train data [3 1 3 ... 3 5 4]

accuracy_score on train dataset :  1.0

Target on test data [7 5 1 ... 7 5 1]

Accuracy on test dataset :  1.0


In [10]:
#SUBMISSION


#submission['Cover_Type'] = predict_test
#submission.to_csv("Submission.csv",columns=['Id','Cover_Type'])