# Kaggle Income Prediction

In [None]:
import numpy as np #for linear algebra
import pandas as pd # CSV file I/O 

import matplotlib #plotting library 

import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


In [None]:
# Read the input CSV files.
# The paths are raw strings so that the Windows backslashes stay literal:
# in a normal string '\t' (as in '...\tcd ml ...') is a TAB character and
# '\K' is an invalid escape, so the original paths could not match the files.
# NOTE(review): `train` is read from the "submission file example" CSV -
# confirm this is really the labelled training file; the code below expects
# columns such as 'Wears Glasses' and 'Income in EUR' in it.
train = pd.read_csv(r'H:\Kaggle Data\Kaggle\tcd ml 2019-20 income prediction submission file example (random solutions).csv')
test = pd.read_csv(r'H:\Kaggle Data\Kaggle\tcd ml 2019-20 income prediction test (without labels).csv')

# Drop the columns considered irrelevant for predicting income.
irrelevant_columns = ['Wears Glasses', 'Hair Color', 'Body Height [cm]']
train = train.drop(irrelevant_columns, axis=1)
test = test.drop(irrelevant_columns, axis=1)

In [None]:
# Preview the first rows of the training data.
# (The original referenced `in_file`, which is never defined in this notebook.)
train.head()

In [None]:
# Keep only the rows that pass all three outlier rules:
#   * income below 5,000,000
#   * not instance 54704 (age 103 with income above 3,000,000 - outlier in plot)
#   * age below 112 (no relevant data beyond that)
# Comparisons against NaN are False, so rows with a missing income or age
# are dropped by these rules as well.
income_ok = train['Income in EUR'] < 5000000
not_flagged_instance = train['Instance'] != 54704
age_ok = train['Age'] < 112
train = train[income_ok & not_flagged_instance & age_ok]

In [None]:
# Stack the feature columns of the training and test sets (training rows
# first) so both go through exactly the same preprocessing.
feature_columns = slice('Year of Record', 'Profession')
train_features = train.loc[:, feature_columns]
test_features = test.loc[:, feature_columns]
all_data = pd.concat((train_features, test_features))

In [None]:
# Histogram of the income next to its log1p transform.
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
raw_income = train["Income in EUR"]
income = pd.DataFrame({
    "Income": raw_income,
    "log(Income + 1)": np.log1p(raw_income),
})
income.hist()

In [None]:
# Log-transform the target (log(1 + x)).
train["Income in EUR"] = np.log1p(train["Income in EUR"])

# Log-transform the skewed numeric features.
# Numeric features are the columns of all_data whose dtype is not "object".
non_object = (all_data.dtypes != "object").to_numpy()
numeric_feats = all_data.columns[non_object]

# Skewness is measured on the training rows only, ignoring missing values;
# features with skewness above 0.65 are treated as skewed.
feature_skew = train[numeric_feats].apply(lambda column: skew(column.dropna()))
skewed_feats = feature_skew[feature_skew > 0.65].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [None]:
# Normalise missing values and placeholder categories.
# Each rule is: column -> (replacement label, placeholder strings to replace).
placeholder_rules = {
    'Gender': ('UNKNOWN', ['0', 'unknown']),
    'University Degree': ('Unknown University', ['0']),
    'Profession': ('Unknown Profession', []),
}

# The clean-up is applied to `all_data` as well as to train/test: `all_data`
# was built from train/test before this cell, so cleaning only train/test
# would leave the features that are one-hot encoded below untouched
# (NaN categories and the '0'/'unknown' placeholders would stay as they are).
for frame in (train, test, all_data):
    for column, (label, placeholders) in placeholder_rules.items():
        # all_data only carries a slice of the columns, and its categorical
        # columns disappear once this cell has been run; skip what is absent.
        if column not in frame.columns:
            continue
        cleaned = frame[column].fillna(label)
        if placeholders:
            cleaned = cleaned.replace(placeholders, label)
        frame[column] = cleaned

# Fill missing incomes with the column mean. The mean is used as-is: the
# original truncated it with int(), which distorts a log-scale target and
# raises ValueError when the mean is NaN.
train['Income in EUR'] = train['Income in EUR'].fillna(train['Income in EUR'].mean())

# One-hot encode the categorical features.
all_data = pd.get_dummies(all_data)

# Detecting outliers: income by university degree.
plt.scatter(train['University Degree'], train['Income in EUR'], c = "blue", marker = "s")
plt.title("Looking for outliers")
plt.xlabel("University Degree")
plt.ylabel("Income in EUR")
plt.show()


In [None]:

# Count the training rows whose income value is below 5,000,000.
# NOTE(review): if the log1p cell above has already run, the column holds
# log values, so this threshold no longer separates anything - confirm intent.
below_limit = train['Income in EUR'] < 5000000
dum = train[below_limit]
len(dum)

In [None]:

# Scatter plot of income against age, used to look for outliers by eye.
plt.scatter(x=train['Age'], y=train['Income in EUR'], c="blue", marker="s")
plt.xlabel("Age")
plt.ylabel("Income in EUR")
plt.title("Looking for outliers")
plt.show()


In [None]:
# Restrict to people older than 100 and plot their income against age.
over_100 = train['Age'] > 100
dum = train[over_100]
plt.scatter(dum['Age'], dum['Income in EUR'], c="blue", marker="s")
plt.xlabel("Age more than 100")
plt.ylabel("Income in EUR")
plt.title("Looking for outliers")
plt.show()


In [None]:

# Scatter plot of income per gender category, used to look for outliers.
gender_plot_style = {"c": "blue", "marker": "s"}
plt.scatter(train['Gender'], train['Income in EUR'], **gender_plot_style)
plt.title("Looking for outliers")
plt.xlabel("Gender")
plt.ylabel("Income in EUR")
plt.show()


In [None]:
# Number of rows in the current `dum` selection.
dum.shape[0]

In [None]:
# View the training data without the outlier instance 54704.
# The result is stored in `du`; `train` itself is not modified by this cell.
is_not_outlier = train['Instance'] != 54704
du = train[is_not_outlier]
du

In [None]:

# Re-draw the income-per-gender scatter plot.
plt.scatter(x=train['Gender'], y=train['Income in EUR'], c="blue", marker="s")
plt.xlabel("Gender")
plt.ylabel("Income in EUR")
plt.title("Looking for outliers")
plt.show()

In [None]:

# Scatter plot of income per profession, used to look for outliers.
professions = train['Profession']
plt.scatter(professions, train['Income in EUR'], c="blue", marker="s")
plt.ylabel("Income in EUR")
plt.xlabel("Profession")
plt.title("Looking for outliers")
plt.show()

In [None]:

# Scatter plot of income per country, used to look for outliers.
countries = train['Country']
plt.scatter(countries, train['Income in EUR'], marker="s", c="blue")
plt.title("Looking for outliers")
plt.ylabel("Income in EUR")
plt.xlabel("Country")
plt.show()

In [None]:

# Scatter plot of income against the year of record, used to look for outliers.
plt.scatter(
    train['Year of Record'],
    train['Income in EUR'],
    c="blue",
    marker="s",
)
plt.xlabel("Year of Record")
plt.ylabel("Income in EUR")
plt.title("Looking for outliers")
plt.show()

In [None]:
# Fill every remaining missing value with the mean of its column
# (the means are computed over the combined train + test rows of all_data).
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

In [None]:
# Split the combined feature matrix back into its train and test parts.
# all_data was built with the training rows first, so the first
# `n_train_rows` rows belong to train and the rest to test.
n_train_rows = train.shape[0]
X_train = all_data.iloc[:n_train_rows]
X_test = all_data.iloc[n_train_rows:]

# Target vector.
y = train['Income in EUR']

# Sanity checks: count the missing values left in the features and in the target.
X_train.isna().sum().sum()
y.isna().sum()

In [None]:
#Prediction algorithms
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def rmse_cv(model):
    """Return the cross-validated RMSE of *model*, one value per fold.

    The model is scored with 5-fold cross-validation on the notebook-level
    ``X_train`` / ``y``. scikit-learn reports the negated mean squared error,
    so the sign is flipped before taking the square root.
    """
    neg_mse = cross_val_score(
        model, X_train, y, scoring="neg_mean_squared_error", cv=5
    )
    return np.sqrt(-neg_mse)

In [None]:
# Ridge regressor with scikit-learn's default settings (created only, not fitted here).
model_ridge = Ridge()

In [None]:
# Fit a ridge regression that picks its regularisation strength from the
# candidate alphas by built-in cross-validation.
ridge_alphas = [1, 0.1, 0.001, 0.0005]
ridge_search = RidgeCV(alphas=ridge_alphas)
model_cv_ridge = ridge_search.fit(X_train, y)

In [None]:
# Fit a lasso regression that picks its regularisation strength from the
# candidate alphas by built-in cross-validation.
lasso_alphas = [1, 0.1, 0.001, 0.0005]
lasso_search = LassoCV(alphas=lasso_alphas)
model_lasso = lasso_search.fit(X_train, y)

In [None]:
# Lasso coefficients labelled with the feature (column) names.
feature_names = X_train.columns
coef = pd.Series(data=model_lasso.coef_, index=feature_names)


In [None]:
# Report how many features the lasso kept (non-zero coefficient) and how many it zeroed out.
n_kept = sum(coef != 0)
n_eliminated = sum(coef == 0)
print("Lasso picked {} variables and eliminated the other {} variables".format(n_kept, n_eliminated))


In [None]:
# The 10 smallest and the 10 largest coefficients, in ascending order.
ranked_coef = coef.sort_values()
lowest_ten = ranked_coef.head(10)
highest_ten = ranked_coef.tail(10)
imp_coef = pd.concat([lowest_ten, highest_ten])


In [None]:
# Horizontal bar chart of the selected lasso coefficients.
matplotlib.rcParams.update({'figure.figsize': (8.0, 10.0)})
imp_coef.plot.barh()
plt.title("Coefficients in the Lasso Model")


In [None]:
# Residual plot for the lasso model on the training data.
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)

fitted_values = model_lasso.predict(X_train)
preds = pd.DataFrame({"preds": fitted_values, "true": y})
# Residual = actual target minus fitted value.
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot.scatter(x="preds", y="residuals")

In [None]:
# Predict on the test set. expm1 is the inverse of log1p, so it maps the
# models' outputs back from the log scale used for the target.
ridge_log_preds = model_cv_ridge.predict(X_test)
lasso_log_preds = model_lasso.predict(X_test)
cv_ridge_preds = np.expm1(ridge_log_preds)
lasso_preds = np.expm1(lasso_log_preds)


In [None]:
# Blend the two models: 60% lasso, 40% ridge.
lasso_weight, ridge_weight = 0.6, 0.4
preds = lasso_weight * lasso_preds + ridge_weight * cv_ridge_preds

In [None]:
# Build the submission table: one predicted income per test instance.
solution = pd.DataFrame({
    "Instance": test["Instance"],
    "Income": preds,
})
solution

In [None]:
# Write the submission to a CSV file, without the DataFrame index column.
submission_path = "tcd ml 2019-20 income prediction submission file.csv"
solution.to_csv(submission_path, index=False)