In [1]:
# import the modules

import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score, accuracy_score

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Load the dataset

- Load the train data and, using all your knowledge, explore the different statistical properties of the dataset.

In [3]:
# Code starts here
# Both CSV inputs are read the same way, so load them in a single pass.
train, sample_submission = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'sample_submission.csv')
)
# Presumably the per-column descriptions (judging by the file name) -- TODO confirm.
columns_dict = pd.read_excel('Mars_Crater_Data_Dictionary.xlsx')

# Code ends here

In [4]:
# Remove the two non-feature columns before modelling.
# `columns=` is used instead of a positional axis: passing `axis` positionally
# to DataFrame.drop was deprecated in pandas 1.3 and raises TypeError in pandas 2.0+.
# Reassigning (rather than inplace=True) keeps the step explicit about what `train` becomes.
train = train.drop(columns=['Unnamed: 0', 'Id'])

In [5]:
# Display the training frame as it stands after 'Unnamed: 0' and 'Id' were dropped.
train

Unnamed: 0,attr0,attr1,attr2,attr3,attr4,attr5,attr6,attr7,attr8,attr9,...,attr1080,attr1081,attr1082,attr1083,attr1084,attr1085,attr1086,attr1087,attr1088,attr1089
0,-4.374765,13.819856,14.656331,-9.728919,-19.334897,0.344455,11.105720,21.977302,14.822923,-24.729940,...,89.083581,86.194838,93.162055,100.883355,123.558503,112.831384,100.583377,102.194939,120.306692,0
1,-13.796261,-4.647589,21.676617,-0.122074,11.228644,-8.806895,-9.161190,18.025709,4.948527,-11.680861,...,100.750899,83.373142,76.902208,72.182997,102.843819,93.118477,80.338570,80.196648,93.995657,0
2,-2.115400,-3.332400,-6.640000,-13.825000,4.123200,27.365000,6.700200,3.783000,8.909500,1.453900,...,52.917000,34.799000,42.562000,51.161000,77.139000,73.367000,50.733000,39.949000,60.731000,0
3,-25.531000,66.699000,-13.025000,-31.198000,12.016000,19.365000,5.045100,20.418000,24.372000,18.163000,...,49.488000,71.633000,66.757000,69.213000,97.606000,81.416000,53.808000,41.489000,71.825000,0
4,18.993000,-5.620000,-9.964900,3.307200,0.999760,-10.920000,-11.392000,3.918500,-1.168300,1.918500,...,84.508000,89.976000,61.169000,33.132000,58.043000,54.522000,80.941000,53.000000,80.615000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5887,3.460759,-6.426485,-21.915368,-18.168091,-13.957889,0.808523,18.660712,18.027418,-2.734253,-37.448656,...,82.641055,79.544054,91.319431,72.672363,86.753505,83.125695,82.865381,96.963409,89.824741,1
5888,3.817495,-6.340042,-22.733775,-2.020537,-18.930786,-2.285926,18.364424,-18.088820,-10.650913,1.497070,...,81.946950,92.665677,100.947021,110.488146,106.245335,122.348703,91.534937,88.677300,113.408667,0
5889,-1.991699,4.785694,7.049837,12.778358,6.959476,-4.428521,-0.527600,3.583282,5.319734,9.606778,...,42.946710,43.569139,37.283671,31.344225,35.061921,31.225543,31.868870,26.024553,26.415935,1
5890,0.785752,-3.375597,5.176214,9.699568,-5.612654,-9.942261,1.703274,6.021695,4.450182,3.766676,...,71.683412,44.407654,41.413167,53.524387,71.596267,76.988948,85.216823,71.966462,79.820469,1


### Visualize the data

- Check the distribution of the target variable. Is the data imbalanced?
- Clean the data, apply some data preprocessing and engineering techniques.

In [6]:
# Code starts here




# Code ends here.

### Model building

- Split the data into train and test.
- Now let's come to the actual task: build a machine learning model and predict the values of `attr1089`.
- Try improving upon the `roc_auc_score` ([ROC-AUC Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score))

In [7]:
# Code Starts here
# Separate the features from the target column `attr1089`.
# `columns=` replaces the positional axis argument, which was deprecated in
# pandas 1.3 and is rejected with a TypeError from pandas 2.0 onwards.
X = train.drop(columns='attr1089')
y = train['attr1089']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# The scaler is fitted on the training split only and then reused on the
# held-out split, so no statistics from the evaluation rows leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = RandomForestClassifier(min_samples_split=12, random_state=0, criterion='entropy')

clf.fit(X_train, y_train)
# Hard class labels for the held-out rows (not probabilities).
predicted = clf.predict(X_test)
# Accuracy on the training split first, then on the held-out split.
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

# Code ends here

0.9946653734238603
0.918552036199095


In [8]:
# Count how many held-out rows were assigned to each predicted class.
predicted_labels = pd.DataFrame(predicted)[0]
predicted_labels.value_counts()

1    997
0    771
Name: 0, dtype: int64

In [9]:
# ROC-AUC on the held-out split. `predicted` holds hard class labels from
# clf.predict, so this scores the thresholded predictions rather than probabilities.
roc_auc_score(y_true=y_test, y_score=predicted)

0.9162501364182036

### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [10]:
# Code Starts here

test = pd.read_csv('test.csv')
# Keep the identifiers aside before they are removed from the feature columns.
Id = test['Id']
# Drop the same two non-feature columns that were removed from the training data.
# `columns=` replaces the positional axis argument, which pandas 2.0+ no longer accepts.
test = test.drop(columns=['Id', 'Unnamed: 0'])

# Code ends here

In [11]:
# Apply the scaler that was fitted on the training split to the test features.
# The scaled matrix gets its own name so `test` keeps holding the raw DataFrame;
# overwriting `test` would make a second run of this cell scale already-scaled
# values and silently change the predictions.
test_scaled = scaler.transform(test)
pred = clf.predict(test_scaled)

In [12]:
# Pair every test Id with its predicted label, using the column names of the submission layout.
submission = pd.DataFrame({'Id': Id}).assign(attr1089=pred)
submission

Unnamed: 0,Id,attr1089
0,3531,0
1,3916,0
2,3065,1
3,3465,1
4,5619,0
...,...,...
1468,6249,0
1469,2158,0
1470,5428,1
1471,408,0


In [13]:
# Write the submission to disk; index=False keeps the row index out of the CSV.
submission.to_csv(path_or_buf='first_submission.csv', index=False)

In [14]:
# Show the provided sample submission loaded from 'sample_submission.csv'.
sample_submission

Unnamed: 0,Id,attr1089
0,3531,1
1,3916,0
2,3065,0
3,3465,0
4,5619,0
...,...,...
1468,6249,0
1469,2158,0
1470,5428,0
1471,408,0
