In [2]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
from sklearn.model_selection import GridSearchCV


In [3]:
# Raw admissions table; the path is resolved against the notebook's working directory.
DATASET_PATH = 'Dataset.csv'
dataset = pd.read_csv(DATASET_PATH)

In [4]:
# Quick look at the first five rows to sanity-check the columns that were loaded.
dataset.head(5)

Unnamed: 0,gre_score,gre_score_quant,gre_score_verbal,test_score_toefl,undergraduation_score,work_ex,papers_published,university_name,status,ranking
0,316,164,152,106.0,2.9,9,3,northeastern_university,accept,15
1,314,163,151,114.0,3.53,7,3,northeastern_university,accept,15
2,307,160,147,88.0,3.0,36,0,northeastern_university,accept,15
3,330,169,161,113.0,2.4,48,3,northeastern_university,accept,15
4,316,163,153,99.0,2.71,30,0,northeastern_university,accept,15


In [5]:
# Number of applications per university, most frequent first.
dataset['university_name'].value_counts()

northeastern_university                        1653
state_university_of_new_york_at_stony_brook     602
north_carolina_state_university_raleigh         588
syracuse_university                             518
university_of_texas_dallas                      474
illinois_institute_of_technology                435
university_of_california_irvine                 400
texas_a_m_university_college_station            387
university_of_north_carolina_at_charlotte       380
indiana_university_bloomington                  343
university_of_colorado_boulder                  341
rochester_institute_of_technology               341
new_york_university                             318
university_of_texas_arlington                   312
rutgers_university_new_brunswick                304
george_mason_university                         264
university_of_maryland_college_park             264
university_of_cincinnati                        242
university_of_texas_austin                      233
carnegie_mel

In [6]:
# Distinct university names, in order of first appearance in the dataset.
university_column = dataset['university_name']
target_universities = university_column.unique().tolist()

In [7]:
from sklearn.utils import resample

In [8]:
# Build a per-university, per-status resampled copy of the dataset.
#
# * universities with more than LARGE_UNIVERSITY_ROWS rows are resampled (with
#   replacement) to LARGE_SAMPLES_PER_STATUS accepts + the same number of rejects;
# * universities with fewer than SMALL_UNIVERSITY_ROWS rows are resampled (with
#   replacement) to SMALL_SAMPLES_PER_STATUS accepts + the same number of rejects;
# * everything in between is copied through unchanged.
#
# NOTE(review): in the cells visible here `resampled_df` is not used afterwards;
# the train/test split in the next cell is taken from `dataset`. Confirm whether
# the split was meant to use `resampled_df` instead.
LARGE_UNIVERSITY_ROWS = 600
SMALL_UNIVERSITY_ROWS = 200
LARGE_SAMPLES_PER_STATUS = 300
SMALL_SAMPLES_PER_STATUS = 125
RESAMPLE_SEED = 123

resampled_dfs = []
for university in target_universities:
    # Filter once per university instead of rebuilding the mask for every branch.
    university_rows = dataset[dataset.university_name == university]
    accepted = university_rows[university_rows.status == 'accept']
    rejected = university_rows[university_rows.status == 'reject']

    row_count = university_rows.shape[0]
    if row_count > LARGE_UNIVERSITY_ROWS:
        samples_per_status = LARGE_SAMPLES_PER_STATUS
    elif row_count < SMALL_UNIVERSITY_ROWS:
        samples_per_status = SMALL_SAMPLES_PER_STATUS
    else:
        samples_per_status = None  # mid-sized university: keep rows as they are

    # Accepts are appended before rejects, matching the original row order.
    for status_rows in (accepted, rejected):
        if samples_per_status is None:
            resampled_dfs.append(status_rows)
        else:
            resampled_dfs.append(
                resample(
                    status_rows,
                    replace=True,
                    n_samples=samples_per_status,
                    random_state=RESAMPLE_SEED,
                )
            )

resampled_df = pd.concat(resampled_dfs)

In [9]:
# Hold out 25% of the rows, stratified jointly on university and admission status
# so both splits keep the same university/status mix.
stratify_keys = dataset[['university_name', 'status']]
training, testing = train_test_split(
    dataset,
    test_size=0.25,
    random_state=5,
    stratify=stratify_keys,
)
print(testing.head(5))

      gre_score  gre_score_quant  gre_score_verbal  test_score_toefl  \
4603        314              168               146             109.0   
7734        300              151               149             101.0   
828         306              163               143             101.0   
5472        313              164               149             108.0   
6108        309              161               148             106.0   

      undergraduation_score  work_ex  papers_published  \
4603                   3.26       13                 0   
7734                   2.72       32                 0   
828                    2.38       45                 0   
5472                   2.35       21                 0   
6108                   2.10       53                 0   

                                  university_name  status  ranking  
4603            rochester_institute_of_technology  accept       66  
7734    university_of_north_carolina_at_charlotte  reject       30  
828        

In [10]:
# Split the training columns by dtype: object columns are treated as categorical,
# integer/float columns as numerical model inputs.
categorical_data = training.select_dtypes(include=['object'])
numerical_data = training.select_dtypes(include=['int64', 'float', 'uint8'])

categorical_features = categorical_data.columns.values
numerical_features = numerical_data.columns.values

print(numerical_features)

['gre_score' 'gre_score_quant' 'gre_score_verbal' 'test_score_toefl'
 'undergraduation_score' 'work_ex' 'papers_published' 'ranking']


In [11]:
def get_result(model, X_train, X_test, Y_train, Y_test):
    """Standardise the features, fit ``model`` and score it on both splits.

    A new ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    ``X_train`` and ``X_test``. ``model`` is fitted in place on the scaled
    training data. F1 is computed with ``'accept'`` as the positive label and
    the confusion matrices use the label order ``['accept', 'reject']``.

    Side effects:
        * prints the first five rows of the unscaled ``X_test``;
        * appends the model object and its four scores to the module-level
          lists ``model_name``, ``model_train_acc``, ``model_test_accuracy``,
          ``model_test_f1`` and ``model_train_f1``, which must already exist
          when this function is called.

    Returns:
        A 12-item list, in this order:
        0 train confusion matrix, 1 test confusion matrix,
        2 train accuracy, 3 test accuracy,
        4 train F1, 5 test F1,
        6 train class probabilities (DataFrame), 7 test class probabilities (DataFrame),
        8 test predictions, 9 train predictions,
        10 the fitted model, 11 the fitted scaler.
    """
    print(X_test[0:5])

    # Fit the scaler on the training split only, then reuse it for the test split.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    model.fit(train_scaled, Y_train)

    test_pred = model.predict(test_scaled)
    train_pred = model.predict(train_scaled)
    test_proba = pd.DataFrame(model.predict_proba(test_scaled))
    train_proba = pd.DataFrame(model.predict_proba(train_scaled))

    f1_test = f1_score(Y_test, test_pred, pos_label='accept')
    f1_train = f1_score(Y_train, train_pred, pos_label='accept')
    acc_train = accuracy_score(Y_train, train_pred)
    acc_test = accuracy_score(Y_test, test_pred)

    label_order = ['accept', 'reject']
    cm_test = confusion_matrix(Y_test, test_pred, labels=label_order)
    cm_train = confusion_matrix(Y_train, train_pred, labels=label_order)

    # Record this run in the notebook-level comparison lists.
    model_name.append(model)
    model_train_acc.append(acc_train)
    model_test_accuracy.append(acc_test)
    model_test_f1.append(f1_test)
    model_train_f1.append(f1_train)

    return [
        cm_train,
        cm_test,
        acc_train,
        acc_test,
        f1_train,
        f1_test,
        train_proba,
        test_proba,
        test_pred,
        train_pred,
        model,
        scaler,
    ]

In [12]:
# Notebook-level accumulators that get_result() appends to on every call, one
# entry per fitted model. Re-running this cell resets all of them to empty.
(
    model_name,
    model_train_acc,
    model_test_accuracy,
    model_train_f1,
    model_test_f1,
) = ([] for _ in range(5))

In [13]:
from sklearn.svm import SVC
import pickle
import numpy as np

# Seeded so the probability calibration enabled by probability=True is repeatable.
svc_model = SVC(gamma='auto', probability=True, random_state=123)
X_train = training[numerical_features]

# One hand-written applicant; values are given in the same order as numerical_features.
string = '314 168 146 109.0 3.26 13 0 66'

data = string.split()
print(data)
print("Type:", type(data))
print("Length:", len(data))

# Fail early with a clear message instead of an IndexError / shape error later on.
if len(data) != len(numerical_features):
    raise ValueError(
        "expected %d feature values, got %d" % (len(numerical_features), len(data))
    )

for token in data:
    print(token)
data = [float(token.strip()) for token in data]

for value in data:
    print(value)

data_np = np.asarray(data, dtype=float).reshape(1, -1)

# Fit and evaluate the SVC. get_result() returns the fitted model at index 10 and
# the StandardScaler it fitted on the training split at index 11.
svc_model_results = get_result(
    svc_model,
    X_train,
    testing[numerical_features],
    training['status'],
    testing['status'],
)
fitted_svc = svc_model_results[10]
sc = svc_model_results[11]

# Scale the sample with the scaler the model was trained with. It is wrapped in a
# DataFrame carrying the training column names because the scaler was fitted on a
# DataFrame.
inp = sc.transform(pd.DataFrame(data_np, columns=numerical_features))
y_pred = fitted_svc.predict(inp)

# Save the model to disk. The original dumped index 11 (the scaler) under this
# "model" filename and never closed the file handle.
# NOTE(review): the model expects inputs scaled by `sc`; persist the scaler as
# well if this file is loaded for inference somewhere else.
filename = 'svc_model.pickel'
with open(filename, 'wb') as model_file:
    pickle.dump(fitted_svc, model_file)

print(training[numerical_features])
print(y_pred)

['314', '168', '146', '109.0', '3.26', '13', '0', '66']
Type: <class 'list'>
Length: 8
314
168
146
109.0
3.26
13
0
66
314.0
168.0
146.0
109.0
3.26
13.0
0.0
66.0
      gre_score  gre_score_quant  gre_score_verbal  test_score_toefl  \
4603        314              168               146             109.0   
7734        300              151               149             101.0   
828         306              163               143             101.0   
5472        313              164               149             108.0   
6108        309              161               148             106.0   

      undergraduation_score  work_ex  papers_published  ranking  
4603                   3.26       13                 0       66  
7734                   2.72       32                 0       30  
828                    2.38       45                 0       89  
5472                   2.35       21                 0       35  
6108                   2.10       53                 0      118  
      gre_

In [14]:
# Slots 2..5 of the get_result() list are train accuracy, test accuracy,
# train F1 and test F1, in that order; report them as percentages.
svc_train_acc, svc_test_acc, svc_train_f1, svc_test_f1 = svc_model_results[2:6]

print('test_accuracy:', svc_test_acc*100)
print('train_accuracy:', svc_train_acc*100)
print('test_f1_score:', svc_test_f1*100)
print('train_f1_score:', svc_train_f1*100)

test_accuracy: 85.49957228400342
train_accuracy: 67.79806046776955
test_f1_score: 78.26055612770341
train_f1_score: 60.947768938083705


In [15]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: the tree permutes candidate features at every split, so without a
# random_state ties between equally good splits can resolve differently on each
# run and the reported scores are not reproducible.
decision_tree_model = DecisionTreeClassifier(random_state=123)

decision_tree_model_results = get_result(
    decision_tree_model,
    training[numerical_features],
    testing[numerical_features],
    training['status'],
    testing['status'],
)

      gre_score  gre_score_quant  gre_score_verbal  test_score_toefl  \
4603        314              168               146             109.0   
7734        300              151               149             101.0   
828         306              163               143             101.0   
5472        313              164               149             108.0   
6108        309              161               148             106.0   

      undergraduation_score  work_ex  papers_published  ranking  
4603                   3.26       13                 0       66  
7734                   2.72       32                 0       30  
828                    2.38       45                 0       89  
5472                   2.35       21                 0       35  
6108                   2.10       53                 0      118  


In [16]:
# Slots 2..5 of the get_result() list are train accuracy, test accuracy,
# train F1 and test F1, in that order; report them as percentages.
tree_train_acc, tree_test_acc, tree_train_f1, tree_test_f1 = decision_tree_model_results[2:6]

print('test_accuracy:', tree_test_acc*100)
print('train_accuracy:', tree_train_acc*100)
print('test_f1_score:', tree_test_f1*100)
print('train_f1_score:', tree_train_f1*100)

test_accuracy: 87.59538066723695
train_accuracy: 100.0
test_f1_score: 85.05890473999081
train_f1_score: 100.0


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: each tree is grown on a random bootstrap sample with random feature
# subsets, so an unseeded forest gives different scores on every run.
random_forest_model = RandomForestClassifier(n_estimators=10, random_state=123)

random_forest_model_results = get_result(
    random_forest_model,
    training[numerical_features],
    testing[numerical_features],
    training['status'],
    testing['status'],
)

# Slots 2..5 of the result list are train accuracy, test accuracy, train F1, test F1.
print('test_accuracy:', random_forest_model_results[3]*100)
print('train_accuracy:', random_forest_model_results[2]*100)
print('test_f1_score:', random_forest_model_results[5]*100)
print('train_f1_score:', random_forest_model_results[4]*100)

      gre_score  gre_score_quant  gre_score_verbal  test_score_toefl  \
4603        314              168               146             109.0   
7734        300              151               149             101.0   
828         306              163               143             101.0   
5472        313              164               149             108.0   
6108        309              161               148             106.0   

      undergraduation_score  work_ex  papers_published  ranking  
4603                   3.26       13                 0       66  
7734                   2.72       32                 0       30  
828                    2.38       45                 0       89  
5472                   2.35       21                 0       35  
6108                   2.10       53                 0      118  
test_accuracy: 90.46107784431138
train_accuracy: 98.37421563034798
test_f1_score: 89.01075268817205
train_f1_score: 98.26378312519036
