In [16]:
# Read data/marriage.csv (path is relative to the working directory) into a DataFrame.
import pandas as pd
data = pd.read_csv('data/marriage.csv')
# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
0,1,female,"5'4""",,others,Telugu,,London,United Kingdom,21.0
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0
3,4,female,"5'0""",Hindu,Thakur,Hindi,Architect,Mumbai,India,30.0
4,5,male,"5'5""",Christian,Born Again,Malayalam,Sales Professional / Marketing,Sulthan Bathery,India,30.0


In [17]:
# (rows, columns) of the frame as loaded, before any rows are removed.
data.shape

(2567, 10)

In [18]:
# Count the missing entries in each column.
data.isna().sum()

id                   0
gender              29
height             118
religion           635
caste              142
mother_tongue      164
profession         330
location           155
country             16
age_of_marriage     19
dtype: int64

In [19]:
# Shape the frame would have if every row containing at least one missing
# value were removed; `data` itself is left untouched by this cell.
data.dropna(axis=0, how='any').shape

(1932, 10)

In [20]:
# Fraction (0-1, not a percentage) of rows that dropping missing values would remove.
(len(data) - len(data.dropna())) / len(data)

0.24737047136735488

In [21]:
# Keep only the rows that have no missing value in any column.
# The result is rebound to `data` instead of using inplace=True, which pandas
# discourages and which hides the mutation from readers of earlier cells.
# NOTE(review): rows are dropped for a missing value in ANY column, including
# `profession` and `location`, which the feature selection later in this
# notebook does not use; `dropna(subset=[...])` would keep more rows —
# confirm whether discarding them is intended.
data = data.dropna()

In [22]:
# Confirm the (rows, columns) left after the rows with missing values were dropped.
data.shape

(1932, 10)

In [23]:
# Preview the remaining rows; the original index labels are kept, so gaps are expected.
data.head()

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0
3,4,female,"5'0""",Hindu,Thakur,Hindi,Architect,Mumbai,India,30.0
4,5,male,"5'5""",Christian,Born Again,Malayalam,Sales Professional / Marketing,Sulthan Bathery,India,30.0
5,6,male,"5'5""",Hindu,Valmiki,Hindi,Sportsman,Delhi,India,29.0


In [24]:
# Split the frame into the feature matrix X and the regression target y.
# `id`, `profession` and `location` are deliberately left out of the features.
feature_cols = ['gender', 'height', 'religion', 'caste', 'mother_tongue', 'country']
# .copy() makes X an independent frame: it is overwritten with encoded values
# further down, and that must neither touch `data` nor raise SettingWithCopyWarning.
X = data.loc[:, feature_cols].copy()
# Bracket access is used instead of attribute access so the column lookup can
# never be shadowed by a DataFrame attribute of the same name.
y = data['age_of_marriage']

In [25]:
# Preview the selected feature columns (still raw strings at this point).
X.head()

Unnamed: 0,gender,height,religion,caste,mother_tongue,country
1,male,"5'7""",Jain,Shwetamber,Gujarati,USA
2,male,"5'7""",Hindu,Brahmin,Hindi,India
3,female,"5'0""",Hindu,Thakur,Hindi,India
4,male,"5'5""",Christian,Born Again,Malayalam,India
5,male,"5'5""",Hindu,Valmiki,Hindi,India


In [26]:
# Preview the target values (age_of_marriage).
y.head()

1    32.0
2    32.0
3    30.0
4    30.0
5    29.0
Name: age_of_marriage, dtype: float64

## Data Cleaning

In [27]:
# Convert every string feature to integer codes with scikit-learn's LabelEncoder.
#
# One encoder is fitted per column and kept in `encoders`, so the exact
# string -> code mapping of each feature can be reused later (for example to
# encode new rows before calling model.predict). Re-fitting a single shared
# encoder for every column would only retain the mapping of the last one.
#
# NOTE(review): `height` is a string such as 5'7" and is encoded in
# lexicographic order (5'10" and 5'11" land between 5'1" and 5'2"), so the
# codes are not monotonic in actual height — consider parsing it to inches.
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['gender', 'height', 'religion', 'caste', 'mother_tongue', 'country']
encoders = {col: LabelEncoder().fit(X[col]) for col in categorical_cols}

# assign() builds a new frame whose columns have a proper integer dtype, instead
# of writing into X through .loc (which can leave object-dtype columns behind).
X = X.assign(**{col: encoders[col].transform(X[col]) for col in categorical_cols})

# Kept for backward compatibility: `enc` used to end up fitted on the last column.
enc = encoders[categorical_cols[-1]]

In [28]:
# Check the features again: every column should now hold integer codes.
X.head()

Unnamed: 0,gender,height,religion,caste,mother_tongue,country
1,1,16,2,34,6,19
2,1,16,1,14,8,5
3,0,7,1,36,8,5
4,1,14,0,13,13,5
5,1,14,1,38,8,5


In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Building the Model

In [41]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 90 trees, each limited to depth 11.
# random_state pins the bootstrap sampling and per-split feature selection, so
# the fitted model, the metrics below and the exported file are reproducible
# on a fresh kernel; without it every run yields a different forest.
model = RandomForestRegressor(n_estimators=90, max_depth=11, random_state=0)
model.fit(X_train, y_train)

# Predictions on the held-out rows, used for evaluation.
y_predict = model.predict(X_test)

## Evaluation

In [42]:
from sklearn.metrics import mean_absolute_error, r2_score

# Mean absolute error on the held-out rows, in the units of the target (years).
print("MAE : ", mean_absolute_error(y_true=y_test, y_pred=y_predict))
# Coefficient of determination; shown as the cell's output value.
r2_score(y_true=y_test, y_pred=y_predict)

MAE :  1.045354054110469


0.6908793706282866

## Export model

In [44]:
# joblib is imported as a standalone package; the older
# `sklearn.externals.joblib` import path was deprecated and then removed from
# scikit-learn, so it is not used here.
import joblib
# Serialize the fitted regressor to a file in the current working directory.
# NOTE(review): only `model` is persisted. The label encodings applied to X are
# not saved with it, so a consumer of this file cannot turn raw strings into
# the integer codes the model was trained on — consider persisting them too.
joblib.dump(model,'marriage_age_predict_model.ml')

['marriage_age_predict_model.ml']