In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import fisher_exact
import scipy.stats as stats
from scipy.stats import norm, skew
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import f1_score, make_scorer
import pickle
from xgboost import XGBClassifier

In [None]:
# Load the training set (contains the 'Category' target), the test set, and
# the per-row topic distribution. "Topic_distribution.csv" is written by the
# text-feature cells further down this file, so those must have been run first.
data = pd.read_csv("train_file.csv")
data1 = pd.read_csv("test_file.csv")
Topic_features  = pd.read_csv("Topic_distribution.csv")

In [None]:
def dummies_Encoding(data):
    """One-hot encode the four categorical permit columns.

    'Permit Type', 'Action Type', 'Work Type' and 'Status' are each expanded
    with ``pd.get_dummies``, using the column name as prefix and dropping the
    first level (``drop_first=True``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the four columns named above.

    Returns
    -------
    pandas.DataFrame
        Indicator columns of all four variables, concatenated side by side in
        the order listed above.
    """
    categorical_columns = ('Permit Type', 'Action Type', 'Work Type', 'Status')
    encoded_parts = [
        pd.get_dummies(data[column], prefix=column, drop_first=True)
        for column in categorical_columns
    ]
    return pd.concat(encoded_parts, axis=1)

In [None]:
def date_features(data):
    """Derive status flags and a duration feature from the permit date columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Application Date', 'Issue Date', 'Final Date' and
        'Expiration Date' columns (missing values allowed).

    Returns
    -------
    pandas.DataFrame
        Seven columns, in this order:
        application date present, issue date present, under review
        (applied but not issued), final date present, under inspection
        (issued but not finalised), expiration date missing, and the number of
        days between application and expiration. Rows whose duration cannot
        be computed receive twice the largest observed duration.
    """
    completed_Submission = ~data['Application Date'].isna()
    Issue_data = ~data['Issue Date'].isna()
    Under_review = completed_Submission & ~Issue_data
    Final_Date = ~data['Final Date'].isna()
    Under_inspection = Issue_data & ~Final_Date
    not_issued = data['Expiration Date'].isna()

    elapsed = pd.to_datetime(data['Expiration Date']) - pd.to_datetime(data['Application Date'])
    # .dt.days works on every pandas version; astype('timedelta64[D]') is
    # rejected by current releases.
    total_date = elapsed.dt.days.astype('float64')
    # Series.max() skips NaN. The builtin max() returns NaN whenever the first
    # element is NaN, which left the missing durations unfilled.
    total_date = total_date.fillna(total_date.max() * 2)

    date_variables = pd.concat(
        [completed_Submission, Issue_data, Under_review, Final_Date,
         Under_inspection, not_issued, total_date],
        axis=1,
    )
    return date_variables

In [None]:
def cluster_labels(data, c1, c2, random_state=None):
    """Fit a K-means and a Gaussian-mixture model on the permit coordinates.

    Missing 'Longitude' / 'Latitude' values are replaced by 0 before fitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Longitude' and 'Latitude' columns.
    c1 : int
        Number of K-means clusters.
    c2 : int
        Number of Gaussian-mixture components.
    random_state : int or None, optional
        Seed forwarded to both models. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible cluster labels.

    Returns
    -------
    tuple
        ``(kmeanModel, gmm)`` - the two fitted estimators.
    """
    coords = pd.concat(
        [data['Longitude'].fillna(0), data['Latitude'].fillna(0)],
        axis=1,
    )
    kmeanModel = KMeans(n_clusters=c1, random_state=random_state)
    kmeanModel.fit(coords)
    gmm = GaussianMixture(n_components=c2, random_state=random_state)
    gmm.fit(coords)
    return kmeanModel, gmm

In [None]:
def contractor_freq(data):
    """Bucket the 'Contractor' column into integer codes 0-7.

    Code 0 marks a missing contractor, codes 1-6 mark the named contractors
    listed in the table below, and code 7 is assigned to every other value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Contractor' column.

    Returns
    -------
    pandas.Series
        The bucket code for each row, on the same index as ``data``.
    """
    buckets = (
        (1, ('SEATTLE HOUSING AUTH GENERAL',)),
        (2, ('SEATTLE SCHOOL DISTRICT (A&S)',)),
        (3, ('U OF W BUILDING PERMIT',)),
        (4, ('SEATTLE PARKS DEPT',)),
        (5, ('BURGESS DESIGN, INC', 'CITY OF SEA F&FD',
             'SAGE HOMES NORTHWEST, LLC', 'IA/INTERIOR ARCHITECTS',
             'SOUND SEISMIC', 'AMAZON.COM')),
        (6, ('PORT OF SEATTLE ENGINEERING', 'BLANKET: BANK OF AMERICA TOWER',
             'CITY INVESTORS', 'GREEN CANOPY HOMES', 'POLYGON WLH LLC')),
    )
    codes = data['Contractor'].fillna(0)
    for bucket, contractor_names in buckets:
        for contractor in contractor_names:
            codes[codes == contractor] = bucket
    # Anything that is not already one of the codes 0..6 falls into bucket 7.
    recognised = codes == 0
    for bucket in range(1, 7):
        recognised = recognised | (codes == bucket)
    codes[~recognised] = 7
    return codes

In [None]:
def scaling(data):
    """Fit a ``StandardScaler`` on the 'Master Use Permit' column.

    Missing values are replaced by 0 before fitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Master Use Permit' column.

    Returns
    -------
    StandardScaler
        The fitted scaler (expects a single-column 2-D input on transform).
    """
    scaler = StandardScaler()
    scaled = data['Master Use Permit'].fillna(0)
    # Series has no reshape() in current pandas; reshape the underlying array.
    scaler.fit(scaled.values.reshape(-1, 1))
    return scaler

In [None]:
def processed_data(data):
    """Combine the date-derived features with the contractor bucket code.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw permit frame accepted by ``date_features`` and ``contractor_freq``.

    Returns
    -------
    pandas.DataFrame
        The columns produced by ``date_features`` followed by a 'Frequ' column
        holding the output of ``contractor_freq``.
    """
    date_part = date_features(data)
    contractor_part = contractor_freq(data).to_frame('Frequ')
    return pd.concat([date_part, contractor_part], axis=1)

In [None]:
# Training frame without the target. It is defined first because its length
# sets the train/test boundary used below (it was previously referenced before
# assignment, which fails on a fresh kernel).
data2 = data.drop('Category', axis=1)
n_train, n_test = len(data2), len(data1)

# Topic features: the first n_train rows belong to the training set and the
# next n_test rows to the test set. The first column of the file is dropped.
first_topic_col = Topic_features.columns[0]
Train_topic = Topic_features.iloc[:n_train].drop(first_topic_col, axis=1)
Test_topic = Topic_features.iloc[n_train:n_train + n_test].drop(first_topic_col, axis=1)
# reset_index() without drop=True re-bases the test rows at 0 and keeps the old
# labels in an 'index' column; the test-preparation cell drops that column.
Test_topic = Test_topic.reset_index()

# Spatial clusters fitted on the training coordinates (3 K-means clusters,
# 5 mixture components); missing coordinates are replaced by 0.
kmeanModel, gmm = cluster_labels(data,3,5)
X1 = data['Longitude'].fillna(0)
X2 = data['Latitude'].fillna(0)
X = pd.concat([X1,X2],axis=1)
Cluster_L1 = kmeanModel.predict(X)
Cluster_L2 = gmm.predict(X)
clusters_features = pd.concat([pd.DataFrame({'KNN': Cluster_L1}),pd.DataFrame({'EM':Cluster_L2}),X],axis=1)

scaler = scaling(data)

# One-hot encode train and test together so both get the same dummy columns,
# then split the encoded frame back by position.
data_temp = pd.concat([data2,data1],axis=0)
dummy_encode_train = dummies_Encoding(data_temp)
dummy_encode = dummy_encode_train.iloc[0:n_train]
dummy_encode_test = dummy_encode_train.iloc[n_train:n_train + n_test]

# Series has no reshape() in current pandas; reshape the underlying array.
scaled_Mast = scaler.transform(data['Master Use Permit'].fillna(0).values.reshape(-1,1))
scaled_Mast = pd.DataFrame(scaled_Mast)


# Three class-rebalancing strategies: random over-sampling, random
# under-sampling and SMOTE.
ros = RandomOverSampler(random_state=9)
rus = RandomUnderSampler(random_state=8)
smote = SMOTE(random_state=9, kind="borderline2")
# Training feature matrix: dummy-encoded categoricals, date/contractor
# features, cluster labels with coordinates, scaled 'Master Use Permit' and
# topic features, concatenated column-wise.
Processed_data1 = pd.concat([dummy_encode,processed_data(data),clusters_features,scaled_Mast,Train_topic],axis=1)
# Resample the full training matrix against the 'Category' target with each strategy.

X_sample_0, y_sample_0 = ros.fit_sample(Processed_data1, data['Category'])
X_sample_s, y_sample_s = smote.fit_sample(Processed_data1, data['Category'])
X_sample_u, y_sample_u =  rus.fit_sample(Processed_data1, data['Category'])
# 70/30 train/test split of every resampled set. No random_state is passed, so
# the split differs from run to run.
# NOTE(review): the split happens after resampling, so rows replicated by the
# over-sampler can presumably land in both partitions - confirm that the
# resulting hold-out scores are meant to be read that way.
X_train_O,X_test_O,y_train_O,y_test_O = train_test_split(X_sample_0,y_sample_0,test_size = 0.3)
X_train_s,X_test_s,y_train_s,y_test_s = train_test_split(X_sample_s,y_sample_s,test_size = 0.3)
X_train_u,X_test_u,y_train_u,y_test_u = train_test_split(X_sample_u,y_sample_u,test_size = 0.3)    

In [None]:
#Random Forest Classifier 
def gridfunc(classifier, parameter, X_train, y_train):
    """Grid-search ``classifier`` over ``parameter`` using weighted F1.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn compatible classifier.
    parameter : dict
        Parameter grid passed to ``GridSearchCV``.
    X_train, y_train : array-like
        Features and labels used for the cross-validated search.

    Returns
    -------
    GridSearchCV
        The fitted search object. Train scores are requested explicitly so
        that ``cv_results_['mean_train_score']`` is available to callers that
        plot train against validation scores.
    """
    # Seeds NumPy's global RNG (side effect visible outside this function).
    np.random.seed(9)
    f1_scorer = make_scorer(f1_score, average='weighted')
    grid_obj = GridSearchCV(
        classifier,
        parameter,
        scoring=f1_scorer,
        # Not computed by default in current scikit-learn releases.
        return_train_score=True,
    )
    return grid_obj.fit(X_train, y_train)
def hp_cv_scores(grid_obj):
    """Plot mean cross-validation scores against the first grid parameter.

    Parameters
    ----------
    grid_obj : GridSearchCV
        A fitted search whose ``param_grid`` is a dict. Only the first
        parameter of the grid is used for the x-axis.

    Notes
    -----
    The train-score curve is drawn only when the search was run with train
    scores enabled; otherwise only the validation curve is shown instead of
    raising ``KeyError``.
    """
    results = grid_obj.cv_results_
    param_name, grid_values = list(grid_obj.param_grid.items())[0]
    param_values = [str(value) for value in grid_values]
    x = np.arange(1, len(param_values) + 1)

    plt.figure(figsize=(10,6))
    mean_train_scores = results.get('mean_train_score')
    if mean_train_scores is not None:
        plt.plot(x, mean_train_scores, c='r', label='Train set')
    plt.xticks(x, param_values)
    plt.plot(x, results['mean_test_score'], c='g', label='Test set')
    # Label the axis with the parameter name; the values are already the ticks.
    plt.xlabel(param_name)
    plt.ylabel('mean scores')
    plt.legend()
    plt.show()
# Search the number of trees for a random forest on the over-sampled training
# data, then plot the cross-validation scores per candidate value.
classifier = RandomForestClassifier(random_state=9)
grid = gridfunc(classifier, {'n_estimators': [10, 40, 60]} , X_sample_0, y_sample_0)
hp_cv_scores(grid)

In [None]:
# Test data preparation: mirror the training feature pipeline on the test set,
# reusing the scaler and cluster models fitted on the training data.
# Series has no reshape() in current pandas; reshape the underlying array.
scaled_Mast1 = scaler.transform(data1['Master Use Permit'].fillna(0).values.reshape(-1,1))
scaled_Mast1 = pd.DataFrame(scaled_Mast1)
X1_t = data1['Longitude'].fillna(0)
X2_t = data1['Latitude'].fillna(0)
X_t = pd.concat([X1_t,X2_t],axis=1)
Cluster_L1_t = kmeanModel.predict(X_t)
Cluster_L2_t = gmm.predict(X_t)
clusters_features_t = pd.concat([pd.DataFrame({'KNN': Cluster_L1_t}),pd.DataFrame({'EM':Cluster_L2_t}),X_t],axis=1)
test_processed = pd.concat([dummy_encode_test,processed_data(data1),clusters_features_t,scaled_Mast1,Test_topic],axis=1)
# 'index' is the helper column left behind when Test_topic's index was reset.
test_processed = test_processed.drop(['index'], axis=1)


In [None]:
# Persist the engineered feature matrices for the modelling section.
# index=False keeps the row number out of the files: they are read back with a
# plain pd.read_csv, which would otherwise turn the saved index into an extra
# feature column.
Processed_data1.to_csv('Training.csv', index=False)
test_processed.to_csv('Test.csv', index=False)

In [None]:
# Prediction for test data: label the test set with the refit best estimator of
# the random-forest grid search and export it next to the permit number.
random_forest =  grid.predict(test_processed)
RF_prediction = pd.concat([data1['Application/Permit Number'],pd.DataFrame({'Category':random_forest})],axis=1)
RF_prediction.to_csv('prediction4.csv', sep=',',index = False)

In [None]:
# Boosting technique: gradient-boosted trees trained on the over-sampled split.
model = XGBClassifier(n_estimators=1000)
model.fit(X_train_O, y_train_O)
y_pred = model.predict(X_test_O)
# Keep and show the hold-out score; as a bare mid-cell expression it was
# computed and then discarded.
boosting_f1 = f1_score(y_test_O, y_pred, average='weighted')
print(boosting_f1)
# DataFrame.as_matrix() is removed in current pandas; .values returns the same array.
Boosting =  model.predict(test_processed.values)
Boosting_prediction = pd.concat([data1['Application/Permit Number'],pd.DataFrame({'Category':Boosting})],axis=1)
Boosting_prediction.to_csv('prediction2.csv', sep=',',index = False)

**Feature extraction from text**

In [None]:
import os
import pandas as pd
import numpy as np
import csv
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from collections import Counter
from string import punctuation
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import f1_score, make_scorer
from string import punctuation
from operator import itemgetter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import word_tokenize, pos_tag, pos_tag_sents
ps = PorterStemmer()
import string 
from collections import Counter
from nltk import word_tokenize, pos_tag, pos_tag_sents
import matplotlib.pyplot as plt
import re
from imblearn.over_sampling import RandomOverSampler

In [None]:
data = pd.read_csv("train_file.csv")
data1 = pd.read_csv("test_file.csv")
# Stack train on top of test so both share one vocabulary and topic model.
data_temp1 = pd.concat([data,data1],axis=0)
# Assign the filled column back: an inplace fillna on a column selection is a
# chained assignment and is not guaranteed to modify data_temp1.
data_temp1['Description'] = data_temp1['Description'].fillna('No address')
# Part-of-speech tag every description: one list of (token, tag) pairs per row.
data_temp1['POS'] = pos_tag_sents(data_temp1['Description'].apply(word_tokenize).tolist())
# Placeholders filled by the following cells; object dtype so that cells can
# hold Python lists and strings.
data_temp1['noun_words'] = np.nan
data_temp1['noun_words_sentec'] = np.nan
data_temp1 = data_temp1.astype('object')

In [None]:
# Keep only the noun tokens of every tagged description. Columns are addressed
# by name instead of fixed positions (20/21), so a change in the CSV schema
# cannot make this read or write the wrong column.
NOUN_TAGS = {'N', 'NN', 'NNP', 'NNS', 'NNPS'}
data_temp1['noun_words'] = data_temp1['POS'].apply(
    lambda tagged: [item[0] for item in tagged if item[1] in NOUN_TAGS]
)

In [None]:
# Join each row's noun list into one space-separated string for the
# vectorizer. Columns are addressed by name instead of fixed positions (21/22).
data_temp1['noun_words_sentec'] = data_temp1['noun_words'].apply(' '.join)

In [None]:
# NOTE(review): no_features is assigned but never passed to the vectorizer
# below (e.g. as max_features), so no 1000-term cap is actually applied.
no_features = 1000 # intended vocabulary cap; currently unused
# Bag-of-words counts over the noun-only text, with English stop words removed
# and document-frequency bounds max_df=0.95 / min_df=0.02.
tf_vectorizer = CountVectorizer(max_df=0.95,min_df=0.02,stop_words='english')
tf = tf_vectorizer.fit_transform(data_temp1['noun_words_sentec'])
tf_feature_names = tf_vectorizer.get_feature_names()

In [None]:
# Candidate topic counts and learning-decay values to search over.
search_params = {'n_components': [5, 10, 15, 20, 30], 'learning_decay': [.5, .7, .9]}

# LDA with default settings; the grid supplies n_components and learning_decay.
lda = LatentDirichletAllocation()

# No scoring argument is given, so the search ranks candidates with the
# estimator's own score method.
model = GridSearchCV(lda, param_grid=search_params)

# Fit on the document-term count matrix (15 parameter combinations).
model.fit(tf)

In [None]:
# For several topic counts, fit an LDA model on the count matrix and measure
# how well a small random forest predicts 'Category' from the topic weights.
ros = RandomOverSampler(random_state=9)
classifier = RandomForestClassifier(n_estimators=10)
ntopics = [5,10,20,40]
fscore = []  # weighted F1 per entry of ntopics, in the same order
for topic_count in ntopics:
    # n_components is the supported name for the number of topics (already
    # used in the grid-search cell); n_topics is rejected by current
    # scikit-learn.
    lda_model = LatentDirichletAllocation(n_components=topic_count,  # Number of topics
                                          max_iter=10,               # Max learning iterations
                                          learning_method='online',
                                          random_state=100,          # Random state
                                          batch_size=128,            # n docs in each learning iter
                                          evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                          n_jobs = -1,               # Use all available CPUs
                                         )
    lda_output = lda_model.fit_transform(tf)
    Topic_distribution = DataFrame(data=lda_output)
    # The first len(data) rows are the training documents (train stacked above test).
    X_sample_0, y_sample_0 = ros.fit_sample(Topic_distribution.iloc[0:len(data)], data['Category'])
    X_train_O,X_test_O,y_train_O,y_test_O = train_test_split(X_sample_0,y_sample_0,test_size = 0.3)
    classifier.fit(X_train_O,y_train_O)
    y_pred = classifier.predict(X_test_O)
    fscore.append(f1_score(y_test_O, y_pred, average='weighted'))

In [None]:
# Final topic model with 10 topics. n_components is the supported name for the
# number of topics (already used in the grid-search cell); n_topics is rejected
# by current scikit-learn.
lda_model = LatentDirichletAllocation(n_components=10,           # Number of topics
                                          max_iter=10,               # Max learning iterations
                                          learning_method='online',
                                          random_state=100,          # Random state
                                          batch_size=128,            # n docs in each learning iter
                                          evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                          n_jobs = -1,               # Use all available CPUs
                                         )
# One row of topic weights per document (train rows first, then test rows).
lda_output = lda_model.fit_transform(tf)
Topic_distribution = DataFrame(data=lda_output)
# Written with the index, so the file's first column is the row number.
Topic_distribution.to_csv("Topic_distribution.csv")

In [None]:
# Train the random forest on the over-sampled topic weights of the training
# rows (the first len(data) rows), then predict the test rows that follow them.
X_sample_0, y_sample_0 = ros.fit_sample(Topic_distribution.iloc[0:len(data)], data['Category'])
classifier.fit(X_sample_0,y_sample_0)
y_pred = classifier.predict(Topic_distribution.iloc[len(data):len(data)+len(data1)])

In [None]:
# Export the topic-only predictions next to the permit number.
Topic_prediction = pd.concat([data1['Application/Permit Number'],pd.DataFrame({'Category':y_pred})],axis=1)
Topic_prediction.to_csv('prediction3.csv', sep=',',index = False)
# Give the ten topic columns readable names and write the file again; this
# overwrites the "Topic_distribution.csv" saved earlier with integer headers.
Topic_distribution.rename(columns={0:'Topic0',1:'Topic1',2:'Topic2',
                                  3:'Topic3',4:'Topic4',5:'Topic5',
                                  6:'Topic6',7:'Topic7',8:'Topic8',
                                  9:'Topic9'}, inplace = True)
Topic_distribution.to_csv("Topic_distribution.csv")

**Models**

In [None]:
import os
import pandas as pd
import numpy as np
import csv
from pandas import DataFrame
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from keras import layers
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import svm   
from imblearn.under_sampling import RandomUnderSampler


In [None]:
# Reload the raw files (for the 'Category' target and the permit numbers) and
# the engineered feature matrices saved as Training.csv / Test.csv.
data = pd.read_csv("train_file.csv")
data1 = pd.read_csv("test_file.csv")
# NOTE(review): read without index_col - if the CSVs were saved together with
# the DataFrame index, that index arrives here as an extra feature column.
Train_features = pd.read_csv("Training.csv")
Test_features = pd.read_csv("Test.csv")
# Over-sample the training features, then hold out 30% (split is unseeded).
ros = RandomOverSampler(random_state=9)
X_sample_0, y_sample_0 = ros.fit_sample(Train_features, data['Category'])
X_train_O,X_test_O,y_train_O,y_test_O = train_test_split(X_sample_0,y_sample_0,test_size = 0.3)

In [None]:
# Under-sampled variant of the training data with its own 70/30 split
# (the split is unseeded, so it differs between runs).
rus = RandomUnderSampler(random_state=8)
X_sample_u, y_sample_u =  rus.fit_sample(Train_features, data['Category'])
X_train_u,X_test_u,y_train_u,y_test_u = train_test_split(X_sample_u,y_sample_u,test_size = 0.3)    

In [None]:
def svc_param_selection(X, y, nfolds):
    """Grid-search ``C`` and ``gamma`` for an RBF-kernel SVC.

    Parameters
    ----------
    X, y : array-like
        Training features and labels.
    nfolds : int or cross-validation generator
        Passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_``).
    """
    candidate_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'gamma': [0.001, 0.01, 0.1, 1],
    }
    search = GridSearchCV(svm.SVC(kernel='rbf'), candidate_grid, cv=nfolds)
    return search.fit(X, y).best_params_
# Fit an RBF SVM with fixed gamma=0.01 and C=0.01 on the over-sampled training
# split (svc_param_selection is not called here) and report the weighted F1 on
# the held-out 30%.
rbf = svm.SVC(kernel='rbf', gamma=0.01, C=0.01).fit(X_train_O, y_train_O)
y_pred = rbf.predict(X_test_O)
f1_score(y_test_O, y_pred, average='weighted')

In [None]:
# Predict the test set with the fitted SVM and export it. The result is not
# stored in a variable called `svm`: that name is the imported sklearn module,
# and rebinding it breaks every later `svm.SVC(...)` call.
# DataFrame.as_matrix() is removed in current pandas; .values returns the same array.
svm_labels = rbf.predict(Test_features.values)
svm_prediction = pd.concat([data1['Application/Permit Number'],pd.DataFrame({'Category':svm_labels})],axis=1)
svm_prediction.to_csv('prediction12.csv', sep=',',index = False)

In [None]:
# NOTE(review): this cell repeats the SVM export of the previous cell and
# writes the same file again - confirm whether it is still needed.
# The result is not stored in a variable called `svm`: that name is the
# imported sklearn module, and rebinding it breaks later `svm.SVC(...)` calls.
# DataFrame.as_matrix() is removed in current pandas; .values returns the same array.
svm_labels = rbf.predict(Test_features.values)
svm_prediction = pd.concat([data1['Application/Permit Number'],pd.DataFrame({'Category':svm_labels})],axis=1)
svm_prediction.to_csv('prediction12.csv', sep=',',index = False)

In [None]:
# Gradient-boosted trees with 15000 estimators on the over-sampled training
# split, using all available cores; reports the weighted F1 on the held-out 30%.
model = XGBClassifier(n_estimators=15000,n_jobs=-1)
model.fit(X_train_O, y_train_O)
y_pred = model.predict(X_test_O)
f1_score(y_test_O, y_pred, average='weighted')

In [None]:
# Predict the test set with the boosted model and export it.
# DataFrame.as_matrix() is removed in current pandas; .values returns the same array.
Boosting =  model.predict(Test_features.values)
Boosting_prediction = pd.concat([data1['Application/Permit Number'],pd.DataFrame({'Category':Boosting})],axis=1)
# NOTE(review): this writes to the same 'prediction12.csv' as the SVM export
# above and therefore overwrites it - confirm the intended file name.
Boosting_prediction.to_csv('prediction12.csv', sep=',',index = False)

In [None]:
# Deep learning model
def keras_model():
    """Build and compile the dense network used by ``KerasClassifier``.

    Four hidden ``Dense(250, relu)`` layers, with ``Dropout(0.5)`` after the
    second and third, followed by a 5-unit softmax output. The input width is
    read from the module-level ``X_train_O``.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy, the Adam optimizer and
        accuracy as metric.
    """
    model1 = Sequential()
    # Builtin int(): the np.int alias is removed in current NumPy releases.
    model1.add(Dense(250, input_dim = int(X_train_O.shape[1]),activation ='relu'))
    model1.add(Dense(250,activation ='relu'))
    model1.add(Dropout(0.5))
    model1.add(Dense(250,activation ='relu'))
    model1.add(Dropout(0.5))
    model1.add(Dense(250,activation ='relu'))
    # Output width is fixed at 5 classes.
    model1.add(Dense(5,activation='softmax'))
    model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model1
# Encode the class labels as integers, then one-hot encode them for the
# softmax output / categorical cross-entropy loss.
encoder = LabelEncoder()
encoder.fit(y_sample_0)
encoded_Y = encoder.transform(y_sample_0)
dummy_y = np_utils.to_categorical(encoded_Y)
# Wrap the network as a scikit-learn estimator (1 epoch, batch size 5) and
# score it with shuffled 3-fold cross-validation on the over-sampled data.
estimator = KerasClassifier(build_fn=keras_model, epochs=1, batch_size=5, verbose=0)
kfold = KFold(n_splits=3, shuffle=True, random_state=50)
results = cross_val_score(estimator, X_sample_0, dummy_y, cv=kfold)
# Mean and standard deviation of the fold scores, in percent.
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# Counter is imported here because the imports cell of this section does not
# bring it into scope.
from collections import Counter

# Majority-vote ensemble over seven previously exported prediction files.
pred1 = pd.read_csv("prediction1.csv")
pred2 = pd.read_csv("prediction2.csv")
pred3 = pd.read_csv("prediction3.csv")
pred4 = pd.read_csv("prediction4.csv")
pred5 = pd.read_csv("prediction5.csv")
pred6 = pd.read_csv("prediction10.csv")
pred7 = pd.read_csv("prediction7.csv")
# One vote per model. pred1 was loaded but never used while pred2 was counted
# twice; each file now contributes exactly one column.
final_pred = pd.concat([pred1['Category'],
                       pred2['Category'],
                       pred3['Category'],
                       pred4['Category'],
                       pred5['Category'],
                       pred6['Category'],
                       pred7['Category']],axis=1)
# Most frequent label per row; ties go to the label that appears first in the
# column order above.
labels = [Counter(votes).most_common(1)[0][0] for votes in final_pred.values.tolist()]
Max_prediction = pd.concat([data1['Application/Permit Number'],pd.DataFrame({'Category':labels})],axis=1)
Max_prediction.to_csv('prediction11.csv', sep=',',index = False)