<a href="https://colab.research.google.com/github/EvinduArunoda/pump-it-up-170473p/blob/main/moracse_training_170473p.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

path = '/content/drive/MyDrive/Machine Learning Project/Data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

test = pd.read_csv(path+"/test_set_values.csv")
train_x = pd.read_csv(path+"/training_set_values.csv")
train_y = pd.read_csv(path+"/training_set_labels.csv")

train = train_y.merge(train_x, on='id')


In [3]:
y = train.pop('status_group')

train['train']=1
test['train']=0

combined = pd.concat([train, test])

y.shape

(59400,)

In [4]:
y.head()

0        functional
1        functional
2        functional
3    non functional
4        functional
Name: status_group, dtype: object

Feature Engineering & Pre-processing

In [6]:
combined['gps_height'].replace(0.0, np.nan, inplace=True)
combined['population'].replace(0.0, np.nan, inplace=True)
combined['amount_tsh'].replace(0.0, np.nan, inplace=True)
combined['longitude'].replace(0.0, np.nan, inplace=True)
combined['latitude'].replace(0.0, np.nan, inplace=True)
combined['construction_year'].replace(0.0, np.nan, inplace=True)

combined["gps_height"].fillna(combined.groupby(['region', 'district_code'])["gps_height"].transform("mean"), inplace=True)
combined["gps_height"].fillna(combined.groupby(['region'])["gps_height"].transform("mean"), inplace=True)
combined["gps_height"].fillna(combined["gps_height"].mean(), inplace=True)

combined["population"].fillna(combined.groupby(['region', 'district_code'])["population"].transform("median"), inplace=True)
combined["population"].fillna(combined.groupby(['region'])["population"].transform("median"), inplace=True)
combined["population"].fillna(combined["population"].median(), inplace=True)

combined["amount_tsh"].fillna(combined.groupby(['region', 'district_code'])["amount_tsh"].transform("median"), inplace=True)
combined["amount_tsh"].fillna(combined.groupby(['region'])["amount_tsh"].transform("median"), inplace=True)
combined["amount_tsh"].fillna(combined["amount_tsh"].median(), inplace=True)

combined["latitude"].fillna(combined.groupby(['region', 'district_code'])["latitude"].transform("mean"), inplace=True)
combined["latitude"].fillna(combined.groupby(['region'])["latitude"].transform("mean"), inplace=True)

combined["longitude"].fillna(combined.groupby(['region', 'district_code'])["longitude"].transform("mean"), inplace=True)
combined["longitude"].fillna(combined.groupby(['region'])["longitude"].transform("mean"), inplace=True)

combined["construction_year"].fillna(combined.groupby(['region', 'district_code'])["construction_year"].transform("median"), inplace=True)
combined["construction_year"].fillna(combined.groupby(['region'])["construction_year"].transform("median"), inplace=True)
combined["construction_year"].fillna(combined.groupby(['district_code'])["construction_year"].transform("median"), inplace=True)
combined["construction_year"].fillna(combined["construction_year"].median(), inplace=True)

# MinMaxScaling features
features=['amount_tsh', 'gps_height', 'population']
scaler = MinMaxScaler(feature_range=(0,20))
combined[features] = scaler.fit_transform(combined[features])

# Creating new features
combined['date_recorded'] = pd.to_datetime(combined['date_recorded'])
combined['functional_period'] = combined.date_recorded.dt.year - combined.construction_year

# DATA Cleaning - Dropping features
waste_features=['wpt_name','num_private','subvillage','region_code','recorded_by','management_group','extraction_type_group','extraction_type_class','payment','scheme_name','quality_group','quantity_group','source_type','source_class','waterpoint_type_group','ward','installer','public_meeting','permit','date_recorded','construction_year']
combined.drop(waste_features,axis=1,inplace=True)

# Convert into Lower case
combined.waterpoint_type = combined.waterpoint_type.str.lower()
combined.funder = combined.funder.str.lower()
combined.basin = combined.basin.str.lower()
combined.region = combined.region.str.lower()
combined.source = combined.source.str.lower()
combined.lga = combined.lga.str.lower()
combined.management = combined.management.str.lower()
combined.quantity = combined.quantity.str.lower()
combined.water_quality = combined.water_quality.str.lower()
combined.payment_type=combined.payment_type.str.lower()
combined.extraction_type=combined.extraction_type.str.lower()

combined["funder"].fillna("other", inplace=True)
combined["scheme_management"].fillna("other", inplace=True)

print(combined.isnull().sum())
combined.info()
combined.quantity.head(5)

id                   0
amount_tsh           0
funder               0
gps_height           0
longitude            0
latitude             0
basin                0
region               0
district_code        0
lga                  0
population           0
scheme_management    0
extraction_type      0
management           0
payment_type         0
water_quality        0
quantity             0
source               0
waterpoint_type      0
train                0
functional_period    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 74250 entries, 0 to 14849
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 74250 non-null  int64  
 1   amount_tsh         74250 non-null  float64
 2   funder             74250 non-null  object 
 3   gps_height         74250 non-null  float64
 4   longitude          74250 non-null  float64
 5   latitude           74250 non-null  float64
 6   basin 

0          enough
1    insufficient
2          enough
3             dry
4        seasonal
Name: quantity, dtype: object

In [7]:
combined.shape

(74250, 21)

In [8]:
combined.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,longitude,latitude,basin,region,district_code,lga,population,scheme_management,extraction_type,management,payment_type,water_quality,quantity,source,waterpoint_type,train,functional_period
0,69572,0.342846,roman,10.324381,34.938093,-9.856322,lake nyasa,iringa,5,ludewa,0.070822,VWC,gravity,vwc,annually,soft,enough,spring,communal standpipe,1,12.0
1,8776,0.011417,grumeti,10.387164,34.698766,-2.147466,lake victoria,mara,2,serengeti,0.182957,Other,gravity,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,1,3.0
2,34310,0.001417,lottery club,5.413324,37.460664,-3.821329,pangani,manyara,4,simanjiro,0.163284,VWC,gravity,vwc,per bucket,soft,enough,dam,communal standpipe multiple,1,4.0
3,67743,0.002846,unicef,2.462504,38.486161,-11.155298,ruvuma / southern coast,mtwara,63,nanyumbu,0.037378,VWC,submersible,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,1,27.0
4,19728,0.014274,action in a,7.995705,31.130847,-1.825359,lake victoria,kagera,1,karagwe,0.130496,other,gravity,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,1,8.0


In [9]:
# Preprocessing
combined['funder'] = pd.factorize(combined['funder'])[0]
combined['scheme_management'] = pd.factorize(combined['scheme_management'])[0]
combined['extraction_type'] = pd.factorize(combined['extraction_type'])[0]
combined['management'] = pd.factorize(combined['management'])[0]
combined['payment_type'] = pd.factorize(combined['payment_type'])[0]
combined['water_quality'] = pd.factorize(combined['water_quality'])[0]
combined['quantity'] = pd.factorize(combined['quantity'])[0]
combined['source'] = pd.factorize(combined['source'])[0]
combined['waterpoint_type'] = pd.factorize(combined['waterpoint_type'])[0]
combined['basin'] = pd.factorize(combined['basin'])[0]
combined['region'] = pd.factorize(combined['region'])[0]
combined['lga'] = pd.factorize(combined['lga'])[0]
combined['district_code'] = pd.factorize(combined['district_code'])[0]
combined['functional_period'] = pd.factorize(combined['functional_period'])[0]
combined.district_code.head(5)

0    0
1    1
2    2
3    3
4    4
Name: district_code, dtype: int64

In [10]:
X = combined[combined["train"] == 1]
X_test = combined[combined["train"] == 0]
X.drop(["train"], axis=1, inplace=True)
X.drop(['id'],axis=1, inplace=True)
X_test.drop(["train"], axis=1, inplace=True)
X_test_id = X_test['id']
X_test.drop(['id'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [11]:
model_rfc = RandomForestClassifier(n_estimators=1000)

In [12]:
model = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree', nrounds = 'min.error.idx', 
                      num_class = 4, maximize = False, eval_metric = 'merror', eta = .2,
                      max_depth = 14, colsample_bytree = .4)

In [13]:
cross_val_score(model_rfc, X, y, cv=3)

array([0.80378788, 0.80292929, 0.79888889])

In [14]:
model_rfc.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
importances = model_rfc.feature_importances_
importances
indices = np.argsort(importances)[::-1]

# Feature Ranking
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

for f in range(X.shape[1]):
    print(X.columns[indices[f]],end=', ')

1. feature 3 (0.152730)
2. feature 4 (0.150417)
3. feature 15 (0.116565)
4. feature 2 (0.077482)
5. feature 17 (0.058197)
6. feature 11 (0.055726)
7. feature 1 (0.054421)
8. feature 9 (0.052051)
9. feature 18 (0.048390)
10. feature 13 (0.032805)
11. feature 8 (0.031586)
12. feature 16 (0.030584)
13. feature 0 (0.027143)
14. feature 6 (0.023292)
15. feature 7 (0.022221)
16. feature 12 (0.018388)
17. feature 5 (0.016956)
18. feature 10 (0.016676)
19. feature 14 (0.014369)
longitude, latitude, quantity, gps_height, waterpoint_type, extraction_type, funder, population, functional_period, payment_type, lga, source, amount_tsh, region, district_code, management, basin, scheme_management, water_quality, 

In [16]:
y_pred = model_rfc.predict(X_test)

In [17]:
y_pred=pd.DataFrame(y_pred)
y_pred['id']= X_test_id
y_pred.columns=['status_group','id']
y_pred=y_pred[['id','status_group']]

In [18]:
y_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            14850 non-null  int64 
 1   status_group  14850 non-null  object
dtypes: int64(1), object(1)
memory usage: 232.2+ KB


In [19]:
pd.DataFrame(y_pred).to_csv(path+"/submission_new.csv",index=False)