In [1]:
import pandas as pd
import numpy as np

from data_utils import *


# DATA IMPORT

# your working directory for the code files
import os
# Resolve every input file relative to the current working directory.
# os.path.join inserts the separator of the host OS, so the notebook is no
# longer tied to Windows-style '\\' paths (which become part of the file name
# on Linux/macOS and make read_csv fail).
cwd = os.getcwd()
path = os.path.join(cwd, 'Data')


# Main organisation tables
TDPorgs_path = os.path.join(path, 'organizations.csv')
UKRIorgs_path = os.path.join(path, 'orgs.csv')

#Auxiliaries
TDPorgs_descr_path = os.path.join(path, 'organization_descriptions.csv')
orgProjectLinks_path = os.path.join(path, 'orgProjectsLinks.csv')
projects_path = os.path.join(path, 'projects.csv')


TDP_orgs = pd.read_csv(TDPorgs_path)
UKRI_orgs = pd.read_csv(UKRIorgs_path)
#Auxiliaries
TDPorgs_descr = pd.read_csv(TDPorgs_descr_path)
# The link table is loaded without its own 'startdate'/'enddate' columns;
# the projects table below keeps columns with those names.
orgProjectLinks = pd.read_csv(orgProjectLinks_path).drop(columns=['startdate', 'enddate'])
# Only the listed project columns are kept, in this order.
projects = pd.read_csv(projects_path)[['projectuuid', 'title', 'potentialimpact', 'leadfunder', 'startdate', 'enddate']]

In [None]:
# Clean the organisation names of each source. The cleaned names are kept both
# as standalone variables (used by the matching step) and as a 'Cleannames'
# column on the originating frame.
# Carried over from the original notes: the column assignments "should be in main code".
TDP_orgsnames = name_cleaner(TDP_orgs)
TDP_orgs['Cleannames'] = TDP_orgsnames

UKRI_orgsnames = name_cleaner(UKRI_orgs)
UKRI_orgs['Cleannames'] = UKRI_orgsnames

# Remove same-named companies from the TDP table (per the original note).
# NOTE(review): TDP_orgsnames was computed before this step and is not
# refreshed afterwards -- confirm the matching step expects that.
TDP_orgs = company_clean(TDP_orgs)

In [None]:
#get list of common companies
# Compares the cleaned TDP names with the cleaned UKRI names; the result is
# later handed to info_merger as its match_table argument.
# NOTE(review): TDP_orgsnames was built before company_clean reassigned
# TDP_orgs, so it may still contain names of rows removed there -- confirm
# that string_matcher / info_merger tolerate this.
matches = string_matcher(TDP_orgsnames, UKRI_orgsnames)

In [None]:
# get the text descriptions and project titles together for the reduced, joined data-set
# Inputs: both organisation tables, the name-match table produced by
# string_matcher, the TDP organisation descriptions, the column-reduced
# projects table and the organisation-project link table.
# The result (orgProjectTexts) is the input of presplit_preprocess in the
# NLP stage below.
orgProjectTexts = info_merger(TDP_orgs, UKRI_orgs,
                              match_table = matches,
                              df1_desc = TDPorgs_descr,
                              df2_info = projects,
                              df_links = orgProjectLinks)

In [None]:
from nlp_utils import *

# Optional shortcut: orgProjectTexts has been stored as an intermediate file,
# so the cells above can be skipped by loading it instead:
# path_int = cwd + '\\Intermediate_Files'
# orgProjectTexts_path = path_int + '\\orgProjectTexts.csv'
# orgProjectTexts = pd.read_csv(orgProjectTexts_path)

# Preprocessing that has to happen before the data is split.
df = presplit_preprocess(orgProjectTexts)

# Separate a combined training/validation part from the test part.
(df_trainval, df_test) = trainval_test_split(df)

# Add negative examples to the training/validation data; the helper is given
# the full frame as well.
df_trainval = trainval_negs(df_trainval, full_data=df)

# Build all rank combinations to try for the test data, passing the column
# order of the training/validation frame.
col_order = df_trainval.columns
df_test = test_combs(df_test, col_order=col_order)

# Stack both parts row-wise again (pd.concat defaults to axis=0) so the
# language processing runs over a single frame.
df_mixed = pd.concat([df_trainval, df_test])

In [None]:
# compute similarity scores
# Runs over the recombined train/validation + test frame (df_mixed).
# meth='lsi' selects the scoring method (presumably latent semantic indexing
# -- confirm in the helper's definition).
df_mixed_sims = similarity_scores(df_mixed, meth='lsi')

In [None]:
# general preprocessing before predictive modelling
# NOTE(review): this cell overwrites its own input, so running it a second
# time feeds the already-processed frame back into feature_preprocess --
# re-run the similarity cell first if this one needs to be repeated.
df_mixed_sims = feature_preprocess(df_mixed_sims)

In [1]:
#Binary classification

from model_utils import *


In [None]:
# Stacked ensemble
# Prepare the model-specific splits, then train/evaluate the ensemble.
# n_orgs is returned by final_preprocessing but not passed on here.
(X_train, X_val,
 y_train, y_val,
 df_test, n_orgs) = final_preprocessing(df_mixed_sims, model="ensemble")

run_ensemble(X_train, X_val, y_train, y_val, df_test)

In [None]:
# Deep neural network
# Prepare the model-specific splits, then train/evaluate the network;
# this runner also receives n_orgs.
(X_train, X_val,
 y_train, y_val,
 df_test, n_orgs) = final_preprocessing(df_mixed_sims, model="dnn")

run_dnn(X_train, X_val, y_train, y_val, df_test, n_orgs)

In [None]:
# CNN
# Prepare the model-specific splits, then train/evaluate the CNN with
# n_steps=5 (n_orgs is returned but not passed on here).
(X_train, X_val,
 y_train, y_val,
 df_test, n_orgs) = final_preprocessing(df_mixed_sims, model="cnn")

run_cnn(X_train, X_val, y_train, y_val, df_test, n_steps=5)

In [None]:
# LSTM
# Prepare the model-specific splits (n_orgs is returned but not passed on here).
(X_train, X_val,
 y_train, y_val,
 df_test, n_orgs) = final_preprocessing(df_mixed_sims, model="lstm")

# Column groups handed to the LSTM runner: organisation-level features and
# project-level features.
org_features = [
    'orguuid',
    'CB_rank',
    'projects_count',
]
proj_features = [
    'project_length',
    'sim',
    'proj_month',
    'proj_year',
]

run_lstm(
    X_train, X_val, y_train, y_val, df_test,
    org_features=org_features,
    proj_features=proj_features,
)