<a href="https://colab.research.google.com/github/BrendaAdel/APT-search-engine/blob/master/BigData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

In [0]:
def visualize_data(df):
    """Show exploratory plots and printed summaries for the loan data frame.

    The function only reads ``df``; nothing is returned. In order it displays:

    1. bar charts of value counts for six categorical columns,
    2. a KDE curve and a box plot for ``LoanAmount`` and ``ApplicantIncome``,
    3. stacked bar charts of the ``Loan_Status`` share within each category,
    4. a scatter plot of ``LoanAmount`` against ``ApplicantIncome``,
    5. a heatmap of absolute correlations between the numeric columns
       (with ``Loan_Status`` converted to category codes),
    6. ``df.describe()`` and the unique ``Credit_History`` values on stdout,
    7. a bar chart of the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns referenced below (``Credit_History``,
        ``Gender``, ``Education``, ``Dependents``, ``Self_Employed``,
        ``Married``, ``LoanAmount``, ``ApplicantIncome``, ``Loan_Status``).
    """
    # Frequency of each category; panels are filled row by row.
    count_panels = [
        ('Credit_History', "Credit History"),
        ('Gender', "Gender"),
        ('Education', "Education"),
        ('Dependents', "Dependents"),
        ('Self_Employed', "Self Employed"),
        ('Married', "Married"),
    ]
    fig, ax = plt.subplots(2, 3)
    for panel, (column, title) in zip(ax.flat, count_panels):
        df[column].value_counts(ascending=True).plot(kind='bar', ax=panel, title=title)
    plt.show()

    # continuous
    for column in ('LoanAmount', 'ApplicantIncome'):
        df[column].plot(kind='kde')
        plt.show()
        df.boxplot(column=column)
        plt.show()

    # relation between categorical and loan_status
    status_panels = [
        ('Credit_History', "Credit History"),
        ('Self_Employed', "Self Employed"),
        ('Married', "Married"),
        ('Education', "Education"),
        ('Gender', "Gender"),
        ('Dependents', "Dependents"),
    ]
    fig, ax = plt.subplots(2, 3)
    for panel, (column, title) in zip(ax.flat, status_panels):
        # Normalise each row so the stacked bars show shares, not counts.
        status_share = pd.crosstab(df[column], df['Loan_Status']).apply(lambda r: r/r.sum(), axis=1)
        status_share.plot(kind='bar', stacked=True, color=['red','blue'], grid=False, ax=panel, title=title)
    plt.show()

    # relation between LoanAmount, ApplicantIncome
    fig, ax = plt.subplots()
    ax.scatter(df['LoanAmount'], df['ApplicantIncome'], c='DarkBlue')
    plt.xlabel("Loan Amount")
    plt.ylabel("Income")
    plt.show()

    # correlation
    # Restrict to numeric/bool columns explicitly: calling corr() on a frame
    # that still holds string columns raises on pandas >= 2.0.
    encoded = df.assign(Loan_Status=df['Loan_Status'].astype('category').cat.codes)
    corr = encoded.select_dtypes(include=['number', 'bool']).corr().abs()
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values)
    plt.show()

    # show summary for numerical data
    print(df.describe().to_string())

    # show factors of values of Credit_History
    print(df['Credit_History'].unique())

    # Check for missing values
    missing_counts = df.isnull().sum()
    positions = numpy.arange(len(missing_counts))
    fig, ax = plt.subplots()
    plt.bar(positions, missing_counts.values, align='center', alpha=0.5)
    # Pass the rotation together with the labels so it applies to the labels
    # that are actually drawn.
    plt.xticks(positions, missing_counts.index.values, rotation=30, horizontalalignment='right')
    plt.ylabel('number of missing values')
    plt.title('missing value')
    plt.show()
    
    



In [0]:
def fillMissingValues(data,train,ob):
    """Log-transform ``LoanAmount`` and impute missing values in ``data`` in place.

    ``LoanAmount`` is replaced by its natural logarithm and then filled with a
    mean; ``Gender``, ``Married``, ``Self_Employed``, ``Credit_History``,
    ``Dependents`` and ``Loan_Amount_Term`` are filled with a mode.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. Must contain the seven columns named above.
    train : bool
        When true, the fill values are computed from ``data`` itself and
        ``ob`` is ignored. Otherwise the values in ``ob`` are used.
    ob : mapping
        Fill value per column name, as returned by a previous call with
        ``train=True``. Only read when ``train`` is false.

    Returns
    -------
    dict or mapping
        The fill values that were applied: a new dict in training mode,
        ``ob`` itself otherwise.

    Raises
    ------
    KeyError
        If ``data`` or ``ob`` lacks one of the expected columns/keys.
    """
    # Columns imputed with their most frequent value.
    mode_columns = ['Gender', 'Married', 'Self_Employed', 'Credit_History',
                    'Dependents', 'Loan_Amount_Term']

    data["LoanAmount"] = numpy.log(data["LoanAmount"])

    if train:
        # Statistics are taken before filling; mean()/mode() skip missing values.
        ob = {'LoanAmount': data['LoanAmount'].mean()}
        for column in mode_columns:
            ob[column] = data[column].mode()[0]

    # Assign the filled column back instead of `data[col].fillna(..., inplace=True)`:
    # the chained inplace form acts on a temporary object under pandas
    # Copy-on-Write and would leave `data` unchanged.
    for column in ['LoanAmount'] + mode_columns:
        data[column] = data[column].fillna(ob[column])

    return ob

In [0]:
def encodeCategoricalData(data,train):
    """Replace six categorical columns of ``data`` with encoder output, in place.

    The columns ``Gender``, ``Married``, ``Dependents``, ``Education``,
    ``Self_Employed`` and ``Property_Area`` are each passed through
    ``LabelEncoder.fit_transform`` and overwritten with the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; must contain the six columns listed above.
    train : bool
        Accepted for interface compatibility. It has no effect: the same
        columns are encoded the same way for training and test frames.

    Notes
    -----
    The encoder is re-fitted on every column of every call, so the codes
    produced for two different frames are derived independently from each
    frame's own values.
    """
    var_mod = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
    le = LabelEncoder()
    for column in var_mod:
        data[column] = le.fit_transform(data[column])

In [0]:
def create_submission_file(df_test,predictions_test,output_path='outfile.csv'):
    """Write a two-column submission CSV pairing each ``Loan_ID`` with its prediction.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame providing the ``Loan_ID`` column.
    predictions_test : array-like
        Predicted ``Loan_Status`` values, one per row of ``df_test``.
    output_path : str or path-like, optional
        Destination file. Defaults to ``'outfile.csv'`` in the current
        working directory; an existing file is overwritten.
    """
    submission_df = pd.DataFrame({
        'Loan_ID': df_test['Loan_ID'],
        'Loan_Status': predictions_test,
    })
    # index=False keeps the row index out of the file so it has exactly two columns.
    submission_df.to_csv(output_path, index=False)

In [0]:
def classifier(data,data2,model,predictors=None):
    """Train an estimator on ``data`` and write its predictions for ``data2``.

    The estimator is fitted on the selected predictor columns of ``data``
    against ``data['Loan_Status']``, used to predict on the same columns of
    ``data2``, and the predictions are handed to ``create_submission_file``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame containing the predictor columns and ``Loan_Status``.
    data2 : pandas.DataFrame
        Frame to predict on; must contain the predictor columns (and whatever
        ``create_submission_file`` reads from it).
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    predictors : sequence of str, optional
        Column names used as features. Defaults to
        ``['Credit_History', 'LoanAmount']``. A single-element list trains on
        one variable; the feature array is still two-dimensional.

    Returns
    -------
    array-like
        Whatever ``model.predict`` returned for ``data2``.
    """
    if predictors is None:
        predictors = ['Credit_History', 'LoanAmount']
    predictors = list(predictors)

    train_features = numpy.array(data[predictors])
    model.fit(train_features, data['Loan_Status'])

    test_features = numpy.array(data2[predictors])
    results = model.predict(test_features)
    create_submission_file(data2, results)
    return results
    
   
    

In [0]:
def featureSelection(data,model):
    """Run forward sequential feature selection and print the chosen column indices.

    Every column of ``data`` except the first and the last is used as a
    candidate feature; the last column is used as the target. The selector
    keeps two features, scored by accuracy, and the printed indices are
    positions within the candidate columns (not within ``data``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column is skipped and whose last column is the target.
    model : estimator
        Estimator handed to the sequential feature selector.
    """
    candidates = data.iloc[:, 1:-1]
    target = data.iloc[:, -1]

    selector = sfs(
        model,
        k_features=2,
        forward=True,
        floating=False,
        verbose=2,
        scoring='accuracy',
    )
    fitted = selector.fit(candidates, target)

    print([*fitted.k_feature_idx_])

In [0]:
if __name__ == "__main__":
    # Load the training data, then impute and encode it in place.
    data = pd.read_csv("train.csv", sep=",")
    # visualize_data(data)  # optional exploratory plots on the raw frame
    valuesSavedFromTrainingData = fillMissingValues(data, True, -1)
    encodeCategoricalData(data, True)

    # Prepare the test data with the fill values saved from the training data.
    data2 = pd.read_csv("test.csv", sep=",")
    fillMissingValues(data2, False, valuesSavedFromTrainingData)
    encodeCategoricalData(data2, False)

    # Optional feature selection, e.g. featureSelection(data, LogisticRegression()).

    # Model training and writing the output file.
    # Other estimators that can be swapped in: LogisticRegression(),
    # KNeighborsClassifier(n_neighbors=3), RandomForestClassifier(n_estimators=25),
    # svm.SVC(gamma='scale').
    # A fixed random_state makes the fitted tree, and therefore the submission
    # file, reproducible across runs.
    model = DecisionTreeClassifier(random_state=0)
    classifier(data, data2, model)

