In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
ss = pd.read_csv('SampleSubmission.csv')
variables = pd.read_csv('VariableDefinitions.csv')

In [3]:
print('train data shape :', train.shape)
print('test data shape :', test.shape)

train data shape : (23524, 13)
test data shape : (10086, 12)


In [4]:
train.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [5]:
# Convert target label to numerical Data
le = LabelEncoder()
train['bank_account'] = le.fit_transform(train['bank_account'])

#Separate training features from target
X_train = train.drop(['bank_account'], axis=1)
y_train = train['bank_account']

print(y_train)

0        1
1        0
2        1
3        0
4        0
        ..
23519    0
23520    0
23521    0
23522    0
23523    0
Name: bank_account, Length: 23524, dtype: int32


In [6]:
 # Convert the following numerical labels from interger to float
float_array = X_train[["household_size", "age_of_respondent", "year"]].values.astype(float)
    
# categorical features to be onverted to One Hot Encoding
categ = ["relationship_with_head",
             "marital_status",
             "education_level",
             "job_type",
             "country"]
    
# One Hot Encoding conversion
X_train = pd.get_dummies(X_train, prefix_sep="_", columns=categ)
    
# Label Encoder conversion
X_train["location_type"] = le.fit_transform(X_train["location_type"])
X_train["cellphone_access"] = le.fit_transform(X_train["cellphone_access"])
X_train["gender_of_respondent"] = le.fit_transform(X_train["gender_of_respondent"])
    
# drop uniquid column
X_train = train.drop(["uniqueid"], axis=1)
X_train.head()

Unnamed: 0,country,year,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,0,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,1,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,0,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,0,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [7]:
    
# scale our data into range of 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)

ValueError: could not convert string to float: 'Kenya'

In [None]:
train

In [None]:
import sklearn.model_selection

In [None]:
# Split train_data
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_Train, X_Val, y_Train, y_val = train_test_split(X_train, y_train, stratify = y_train, 
                                                  test_size = 0.3, random_state=42)
Logistic_regr = LogisticRegression()
Logistic_regr.fit(X_train, y_train)
pred = Logistic_regr.predict(X_Val)
prob = Logistic_regr.predict_proba(X_Val)
y_pred

In [None]:
def load_data(filename):
	df = pd.read_csv(filename, sep=",", index_col=False)
	df.columns = ["country","year","uniqueid","bank_account","location_type","cellphone_access",
                  "household_size","age_of_respondent", "gender_of_respondent", "relationship_with_head",
                  "marital_status", "education_level", "job_type"]
	data = np.array(df, dtype=float)
	plot_data(data[:,:2], data[:, -1])
	return data[:,:2], data[:, -1]

def plot_data(x, y):
	plt.xlabel('score of test-1')
	plt.ylabel('score of test-1')
	for i in range(x.shape[0]):
		if y[i] == 1:
			plt.plot(x[i,0], x[i,1], 'gX')
		else:
			plt.plot(x[i,0], x[i,1], 'mD')
	plt.show()

def sigmoid(z):
	return 1/(1 + np.exp(-z))

def cost_function(x, y, theta):
	h = sigmoid(x@theta)
	print(np.log(h).shape)
	one = np.ones((y.shape[0],1))
	return (-((y.T@np.log(h)) + (one-y).T@np.log(one - h))/(y.shape[0]))

def gradient_descent(x, y, theta, learning_rate=0.1, num_epochs=10):
	m = x.shape[0]
	J_all = []
	
	for _ in range(num_epochs):
		h_x = sigmoid(x@theta)
		cost_ = (1/m)*(x.T@(h_x - y))
		theta = theta - (learning_rate)*cost_
		J_all.append(cost_function(x, y, theta))

	return theta, J_all 


def plot_cost(J_all, num_epochs):
	plt.xlabel('Epochs')
	plt.ylabel('Cost')
	plt.plot(num_epochs, J_all, 'm', linewidth = "5")
	plt.show()

def predict(prob):
	if(prob >= 0.5):
		return 1
	else:
		return 0

def test(theta, x):
	y = float(sigmoid(x@theta))
	if predict(y) == 1 :
		print("Admit")
	else:
		print("Reject")

In [None]:
x, y = load_data("train.csv")
y = np.reshape(y, (y.shape[0], 1))
x = np.hstack((np.ones((x.shape[0], 1)), x))
theta = np.zeros((x.shape[1], 1))
learning_rate = 0.001
num_epochs = 100
theta, J_all = gradient_descent(x, y, theta, learning_rate, num_epochs)
J = cost_function(x, y, theta)
print(theta)
print(J)

n_epochs = []
jplot = []
count = 0
for i in J_all:
	jplot.append(i[0][0])
	n_epochs.append(count)
	count += 1
jplot = np.array(jplot)
n_epochs = np.array(n_epochs)
plot_cost(jplot, n_epochs)

test(theta, [1, 48, 85])