In [1]:
import pandas as pd
# Load the training data and keep only the text column and its label.
df = pd.read_csv('Data/Train.csv').loc[:, ['content', 'category']]


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# pulling the model inputs and targets out of an already split training and test set (used for both the ensemble model and the original dataset)
def train_test_split_features(train, test,train_feature, target_feature,vectorise, final_prediction=False):
    """Extract model inputs and targets from an already split train/test pair.

    Parameters
    ----------
    train, test : objects indexable by column label (e.g. pandas DataFrames).
    train_feature : column label, or list of labels, used as the model input.
    target_feature : column label holding the value to predict.
    vectorise : when truthy, a TF-IDF vectoriser is fitted on the training
        input and applied to both the training and the test input.
    final_prediction : when truthy, the target column is also read from
        ``test``; otherwise ``y_test`` is ``None``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_names)``.
        ``feature_names`` is the vectoriser vocabulary as a list, or an empty
        list when ``vectorise`` is falsy.
    """
    y_train = train[target_feature]
    X_train = train[train_feature]
    X_test = test[train_feature]
    y_test = test[target_feature] if final_prediction else None

    feature_names = []
    if vectorise:
        # TF-IDF over 1- to 4-grams, ignoring terms seen in fewer than 5 documents.
        vect = TfidfVectorizer(min_df=5, ngram_range=(1, 4))
        # Fit on the training text only, so nothing from the test text leaks in.
        X_train = vect.fit_transform(X_train)
        X_test = vect.transform(X_test)
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer its replacement and only fall back on old installations.
        if hasattr(vect, "get_feature_names_out"):
            feature_names = list(vect.get_feature_names_out())
        else:
            feature_names = vect.get_feature_names()

    return X_train, X_test, y_train, y_test, feature_names


# Optional fast mode: keep only a 10% sample of the data so the notebook runs quickly.
quickrun=False
if quickrun:
    df, _ = train_test_split(df, test_size=0.9, random_state=42)
    # train_test_split keeps the original (now shuffled and gappy) row labels;
    # renumber them so code that addresses rows by fold position stays aligned.
    df = df.reset_index(drop=True)


In [3]:
# a quick output of most of the relevant metrics
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
def print_model_performance(target,predicted):
    """Print the classification report, accuracy and confusion matrix.

    ``target`` holds the true labels and ``predicted`` the labels produced by
    a model; both are handed unchanged to the scikit-learn metric functions.
    """
    print('outcome of training')

    report = classification_report(target, predicted)
    print(report)

    accuracy = accuracy_score(target, predicted)
    print('test average accuracy ', accuracy)

    matrix = confusion_matrix(target, predicted)
    print(matrix)


In [4]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

# a list of all models used
def all_models(include_slow=False):
    """Build the ``(estimator, name)`` pairs used as base models.

    Parameters
    ----------
    include_slow : bool, default False
        When False, only the subset that was selected because of runtime is
        built and returned. When True, the full list of eight classifiers is
        returned instead.

    Returns
    -------
    list of (estimator, str)
        Freshly constructed, unfitted estimators paired with a display name.
        The name is also used by the calling cells as a column-name prefix.
    """
    #Using the recomended classifiers
    #https://arxiv.org/abs/1708.05070
    GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
    # NOTE: the estimator is a logistic regression; the "linear_regression"
    # label is kept unchanged because it ends up in generated column names.
    LR = LogisticRegression(C=1.5,fit_intercept=True)
    # Models that were not included in the paper not from SKlearn
    XGC = XGBClassifier()
    light_gb = lgb.LGBMClassifier()

    if not include_slow:
        #this subset was selected due to runtime
        return [(LR, "linear_regression"), (GBC, "gradient_boosted_classifier"),
                (XGC, "XGBoost"), (light_gb, "Light_GBM")]

    # The remaining estimators are only constructed when explicitly requested.
    RFC = RandomForestClassifier(n_estimators=500, max_features=0.25, criterion="entropy")
    SVM = SVC(C = 0.01, gamma=0.1, kernel="poly", degree=3, coef0=10.0)
    ETC = ExtraTreesClassifier(n_estimators=1000, max_features="log2", criterion="entropy")
    CBC = CatBoostClassifier(silent=True)
    return [(LR, "linear_regression"), (ETC, "Extra_tree_classifier"), (SVM, "support_vector_classifier"),
            (RFC, "random_forest_classifier"), (GBC, "gradient_boosted_classifier"),
            (XGC, "XGBoost"), (light_gb, "Light_GBM"), (CBC, "catboost_classifier")]

In [5]:
from sklearn.model_selection import KFold
# running the given model over the dataset with 5-fold cross-validation and collecting the prediction for every row
def run_features(df, model,predict_probability=False,features='content', vectorise=True):
    """Cross-validate ``model`` on ``df`` and return one prediction per row.

    Parameters
    ----------
    df : pandas DataFrame containing the ``features`` column(s) and a
        ``'category'`` target column.
    model : estimator exposing ``fit`` and ``predict`` (and ``predict_proba``
        when ``predict_probability`` is truthy). It is refitted in place on
        every fold, so on return it holds the fit from the last fold.
    predict_probability : when truthy, collect ``predict_proba`` rows instead
        of predicted labels.
    features : column label, or list of labels, used as model input.
    vectorise : forwarded to ``train_test_split_features``.

    Returns
    -------
    list
        The per-fold predictions concatenated in fold order.
    """
    # The folds are unshuffled, so they are contiguous and concatenating the
    # fold outputs restores the row order of ``df``. random_state is left out
    # because scikit-learn rejects it when shuffle is False.
    cv = KFold(n_splits=5, shuffle=False)

    predictions = []
    for train_index, test_index in cv.split(df):
        # KFold yields row positions, not index labels, hence iloc.
        df_train, df_test = df.iloc[train_index], df.iloc[test_index]
        X_train, X_test, y_train, _, _ = train_test_split_features(
            df_train, df_test, features, 'category', vectorise
        )
        model.fit(X_train, y_train)
        if predict_probability:
            fold_prediction = model.predict_proba(X_test)
        else:
            fold_prediction = model.predict(X_test)
        # extend() adds one entry per test row (a label or a probability row).
        predictions.extend(fold_prediction)
    return predictions

# saving the probabilities of each of the features so that it can be used to train an ensemble model
def save_feature_probabilities(df_copy,model_predicted_names,model_name,prediction):
    """Store per-class probabilities as new columns of ``df_copy``.

    Parameters
    ----------
    df_copy : pandas DataFrame with one row per entry of ``prediction``.
        It is modified in place.
    model_predicted_names : list of str that the new column names are
        appended to. It is modified in place.
    model_name : str prefix for the new columns; class ``i`` (1-based, in the
        order the probabilities appear in each row) becomes ``model_name_i``.
    prediction : iterable of equally long probability rows, one per data row.
        Any number of classes is accepted.

    Returns
    -------
    tuple
        ``(df_copy, model_predicted_names)``, the same objects that were
        passed in.

    Raises
    ------
    ValueError
        If the rows of ``prediction`` do not all have the same length.
    """
    rows = [list(row) for row in prediction]
    if len({len(row) for row in rows}) > 1:
        raise ValueError("every prediction row must contain the same number of class probabilities")

    # zip(*rows) transposes the row-wise probabilities into one sequence per class.
    for class_position, class_probabilities in enumerate(zip(*rows), start=1):
        column_name = model_name + '_' + str(class_position)
        df_copy[column_name] = list(class_probabilities)
        model_predicted_names.append(column_name)

    return df_copy, model_predicted_names
    


In [None]:
# Cross-validate every base model. With predict_probability switched on, the
# per-class probabilities are stored as new columns of df_copy; otherwise the
# predicted labels are scored against the true category.
df_copy = df.copy()
models = all_models()
model_predicted_names = []

for model, name in models:
    print(name)
    predict_probability = True
    prediction = run_features(df_copy, model, predict_probability)

    if not predict_probability:
        print_model_performance(df_copy['category'], prediction)
        continue

    model_name = '{}_prediction'.format(name)
    df_copy, model_predicted_names = save_feature_probabilities(
        df_copy, model_predicted_names, model_name, prediction
    )

df_copy

linear_regression




gradient_boosted_classifier




In [None]:
# Stack the base models: a random forest is cross-validated on the probability
# columns listed in model_predicted_names. The seed makes reruns reproducible.
original_ensemble_model = RandomForestClassifier(random_state=42)
predict_probability=False
predictions=run_features(df_copy,original_ensemble_model,predict_probability=predict_probability, features=model_predicted_names,vectorise=False)
# run_features refits the estimator on each fold, so at this point it has only
# seen the last fold's training rows. Refit on every row so the model that is
# later applied to the test set uses all of the available training data.
original_ensemble_model.fit(df_copy[model_predicted_names], df_copy['category'])


In [None]:
# Score the ensemble's cross-validated predictions against the true categories.
print_model_performance(df_copy['category'],predictions)

In [None]:
# Load the test set that the submission predictions are generated for.
df_test_set=pd.read_csv('Data/Test.csv')


In [None]:
# running the model by making use of the entire training set
def running_test_set(df_train, df_test, model,predict_probability=False,features='content', vectorise=True, train_model=True):
    """Predict for ``df_test`` with ``model``, optionally fitting it first.

    The inputs and training targets are obtained from
    ``train_test_split_features`` using the ``'category'`` column as target.
    When ``train_model`` is truthy the model is fitted on all of ``df_train``
    before predicting. Returns ``predict_proba`` output when
    ``predict_probability`` equals ``True``, otherwise ``predict`` output.
    """
    train_inputs, test_inputs, train_targets, _test_targets, _names = train_test_split_features(
        df_train, df_test, features, 'category', vectorise, False
    )

    if train_model:
        model.fit(train_inputs, train_targets)

    # The flag is compared against True (not just tested for truthiness).
    predictor = model.predict_proba if predict_probability == True else model.predict
    return predictor(test_inputs)


In [None]:
# Fit every base model on the full training data and predict for the test set.
# With predict_probability switched on, the per-class probabilities are stored
# as new columns of df_test_copy.
df_copy = df.copy()
df_test_copy = df_test_set.copy()
models = all_models()
model_predicted_names = []

for model, name in models:
    print(name)
    predict_probability = True
    prediction = running_test_set(df_copy, df_test_copy, model, predict_probability)

    if not predict_probability:
        print_model_performance(df_test_copy['category'], prediction)
        continue

    model_name = '{}_prediction'.format(name)
    df_test_copy, model_predicted_names = save_feature_probabilities(
        df_test_copy, model_predicted_names, model_name, prediction
    )

df_test_copy

In [None]:
# Predict a category for every test row with the ensemble, using the base
# models' probability columns as input; the labels are stored in a new
# 'predictions' column.
prediction = original_ensemble_model.predict(df_test_copy[model_predicted_names])
# If the submission is scored on loss rather than on hard labels, the class
# probabilities will probably perform better than the predicted labels.
prediction_probabilities = original_ensemble_model.predict_proba(df_test_copy[model_predicted_names])
df_test_copy['predictions']=prediction


In [None]:
# Write the submission file: the row id plus a one-hot encoding of the
# predicted category.
SUBMISSION_CATEGORIES = ['Biashara','Burudani','Kimataifa','Kitaifa','michezo']
output_df=df_test_copy.copy()
# dtype=int keeps the indicator columns as 0/1 (newer pandas defaults to
# booleans, which would be written as True/False). reindex guarantees that a
# column exists for every category, even one that was never predicted.
dummies = pd.get_dummies(output_df['predictions'], drop_first=False, dtype=int).reindex(
    columns=SUBMISSION_CATEGORIES, fill_value=0
)
output_df=pd.concat([output_df, dummies], axis=1)
output_df[['swahili_id'] + SUBMISSION_CATEGORIES].to_csv("submission.csv", index=False)
