In [None]:
### BEGIN SOLUTION

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Read the competition's train and test CSVs, and keep the test ids aside
# so they can be attached to the predictions in the submission file.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ucfai-dsg-fa19-default/train.csv",
        "../input/ucfai-dsg-fa19-default/test.csv",
    )
)
ID_test = test['id']

In [None]:
# How many loans fall into each class of the target (1 = good standing, 0 = bad)?
train.loc[:, "GOOD_STANDING"].value_counts()

In [None]:
# So there are 9x as many good loans as bad (naturally, any reputable lender would avoid bad loans).
# This is problematic, because most models will notice that most feature values are associated with good loans.
# Therefore, they will most likely just predict that every loan is good. Why is this a problem?

# The score for this comp is the ROC AUC metric. In an oversimplified sense, this score depends on
# how well you identify BOTH your positives AND your negatives.
# If you are effectively guessing on either of them (e.g. by predicting one class for everything), expect a chance-level score of about 0.5.

# There are almost 1 million examples, so it is safe to undersample.
# Undersampling means we only use a subset of the training data, chosen so that the numbers of good and bad loans are equal.
# The simple way to do this is to randomly pick good loans until we have as many of them as we have bad loans.
# Here is how we are going to undersample:
import numpy as np

# Undersample the majority class: keep every bad loan plus an equally sized
# random subset of the good loans, so both classes end up balanced.

# Fixed seed so that Restart & Run All draws the same subset of good loans
# every time (the original draw was unseeded and changed on each run).
UNDERSAMPLE_SEED = 42
undersample_rng = np.random.RandomState(UNDERSAMPLE_SEED)

# Index labels of all the bad loans (GOOD_STANDING == 0), and how many there are
bad_standing_index = train[train['GOOD_STANDING'] == 0].index
bad_standing_len = len(bad_standing_index)

# Index labels of all the good loans (GOOD_STANDING == 1)
good_standing_index = train[train['GOOD_STANDING'] == 1].index

# Randomly choose, without replacement, as many good loans as there are bad loans.
# NOTE: this raises ValueError if there are fewer good loans than bad ones.
random_index = undersample_rng.choice(good_standing_index, bad_standing_len, replace=False)

# Concatenate the indices of the bad loans and the randomly sampled good loans
under_sample_index = np.concatenate([bad_standing_index, random_index])

# Build a new dataframe made up of only those rows
under_sample = train.loc[under_sample_index]



In [None]:
# Make sure the undersampling worked, then make the undersampled dataframe our train.
# The before/after counts are collected into one table and left as the cell's last
# expression; previously the cell ended in an assignment, so nothing was displayed.
class_balance = pd.DataFrame({
    "before": train['GOOD_STANDING'].value_counts(),
    "after": under_sample['GOOD_STANDING'].value_counts(),
})

# Both classes must have the same number of rows after undersampling
assert under_sample['GOOD_STANDING'].value_counts().nunique() == 1, "classes are not balanced"

train = under_sample
class_balance

In [None]:
# As we did in Titanic, let's concatenate train and test so that every cleaning
# step is applied to both at once.
# train_len records where the train rows end so the two can be split apart again later.
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

# Sanity check: no rows were lost or duplicated by the concatenation
assert len(dataset) == train_len + len(test)

In [None]:
# Fill missing entries with NaN.
# NOTE(review): presumably carried over from the Titanic notebook to make all
# missing-value markers uniform; confirm it is still needed here.
dataset = dataset.fillna(value=np.nan)

In [None]:
# Total number of rows in the combined (train + test) dataset
dataset.shape[0]

In [None]:
# There are a lot of columns and a lot of nulls, so just delete every feature that has
# more than 20% of its values missing and go from there.
MAX_MISSING_FRACTION = 0.2

null_list = dataset.isnull().sum()
missing_fraction = null_list / len(dataset.index)

# Never drop the target: its values are missing for every test row by construction.
sparse_columns = [
    column
    for column in missing_fraction.index[missing_fraction > MAX_MISSING_FRACTION]
    if column != "GOOD_STANDING"
]

# One vectorised drop instead of looping with Series.iteritems(), which was removed
# in pandas 2.0, and re-creating the frame once per dropped column.
dataset = dataset.drop(columns=sparse_columns)
dataset.isnull().sum()

In [None]:
# Columns we remove by hand, each with its reason.
redundant_columns = [
    # "sub grade" exists, so grade is redundant.
    'grade',
    # As we discussed last week, the issue date is unbalanced between train and test,
    # so there is likely not much to learn from it.
    'issue_d',
    # Employee title might seem like a problematic one to lose, but consider two things:
    # we already have annual income, so how much more can we glean from the title? And if
    # we turn it into dummy variables, there are a LOT of different titles, so they will be sparse.
    'emp_title',
]
dataset = dataset.drop(redundant_columns, axis=1)

In [None]:
# Under most circumstances you want to replace nulls intelligently, but to give you an idea
# of how to clean this dataset efficiently, let's replace missing continuous values with the
# column mean and missing categorical values with the column mode.

# Numeric (and boolean) columns, found through the public select_dtypes API
number_set = set(dataset.select_dtypes(include=["number", "bool"]).columns)

for column in dataset.columns:
    print("Now handeling", column)
    # Skip the target variable: its nulls are the test rows we want to predict
    if column == 'GOOD_STANDING':
        continue
    # Nothing to fill in this column
    if not dataset[column].isnull().any():
        continue
    if column in number_set:
        # Numerical values: fill with the mean of the observed values
        fill_value = dataset[column].mean()
    else:
        # Otherwise, fill with the most frequent categorical value
        fill_value = dataset[column].value_counts().idxmax()
    # Whole-column assignment. The previous per-row `dataset[col].iloc[x] = value` was a
    # chained assignment: very slow, warns on older pandas, and is silently a no-op when
    # copy-on-write is enabled.
    dataset[column] = dataset[column].fillna(fill_value)
            


In [None]:
# We're going to drop a few categorical columns that have way too many possible values.
# There are certainly ways to make use of them, especially the dates, but for the time being
# we remove them: with too many distinct values, get_dummies creates too many new columns.
dataset = dataset.drop(['earliest_cr_line', 'last_credit_pull_d', 'last_pymnt_d', 'addr_state', 'title'], axis=1)

# Everything that is not numeric/boolean is treated as categorical. Keep the columns in
# their dataframe order: the previous set difference had an arbitrary iteration order, so
# the dummy-column order (and anything sensitive to it) could change between kernels.
numeric_columns = set(dataset.select_dtypes(include=["number", "bool"]).columns)
categorical_features = [column for column in dataset.columns if column not in numeric_columns]

print(categorical_features)
dataset = pd.get_dummies(dataset, columns=categorical_features)
dataset.head()

In [None]:
# Show only the columns that still contain nulls. After one-hot encoding the full
# per-column listing is long enough to be truncated by the display, which hides problems.
# Presumably only GOOD_STANDING should remain (the test rows have no label) - verify.
remaining_nulls = dataset.isnull().sum()
remaining_nulls[remaining_nulls > 0]

In [None]:
# Separate train and test again using the row count recorded before concatenation.
# .copy() gives train its own data, so the column assignment below does not write into
# a slice of `dataset` (which triggers SettingWithCopyWarning).
train = dataset.iloc[:train_len].copy()

# Drop the good standing column from test (which should all be empty). drop() without
# inplace returns a new frame, so `dataset` is left untouched.
test = dataset.iloc[train_len:].drop(columns=["GOOD_STANDING"])

# Make sure the labels are ints
train["GOOD_STANDING"] = train["GOOD_STANDING"].astype(int)

Y_train = train["GOOD_STANDING"]

X_train = train.drop(columns=["GOOD_STANDING"])

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Let's just use a basic random forest.
# random_state makes the fitted forest reproducible across runs; n_estimators is set
# explicitly because the library default differs between scikit-learn versions.
RF_SEED = 42
RF = RandomForestClassifier(n_estimators=100, random_state=RF_SEED)
RF.fit(X_train, Y_train)

In [None]:
# The competition is scored with ROC AUC, which measures how well the predictions RANK
# good loans above bad ones. Hard 0/1 labels throw that ranking information away, so
# submit the predicted probability of the positive class (GOOD_STANDING == 1) instead.
positive_class_column = list(RF.classes_).index(1)
test_standing = pd.Series(RF.predict_proba(test)[:, positive_class_column], name="GOOD_STANDING")

# Both series carry a default 0..n-1 index, so they line up row by row
results = pd.concat([ID_test, test_standing], axis=1)

results.to_csv("GradePrediction.csv", index=False)

In [None]:
### END SOLUTION