# Let's read all data (text and numerical)

In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read word count data
# Each row is split on ','; field 0 is used as the key and the natural log of
# field 1 is stored in a one-element list so it can be concatenated directly
# onto a feature row later.
unique_words = dict()
with open("../data/text_count.csv") as fp:  # the handle is closed even if parsing fails
    for line in fp:
        line = line.strip().split(',')
        unique_words[line[0]] = [np.log(float(line[1]))]
    
    
# Read pre-processed text data
# Each row is split on ','; field 0 is used as the key and every remaining
# field is kept, as a list of strings, as the value.
docs = dict()
with open("../data/text_pre-processed.csv") as fp:  # the handle is closed even if parsing fails
    for line in fp:
        line = line.strip().split(',')
        docs[line[0]] = line[1:]
    
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar.

    Args:
        x: A real number.

    Returns:
        A float in [0, 1]. For inputs so negative that ``math.exp(-x)``
        overflows, the limiting value 0.0 is returned instead of raising
        ``OverflowError``.
    """
    try:
        return 1/(1+math.exp(-x))
    except OverflowError:
        # exp(-x) is too large to represent, so the quotient is effectively 0.
        return 0.0


# Training data
# Fields as indexed below: 0 = id, 1 = span, 2 = target (Y), 3.. = numeric features.
with open("../data/train_pruned.csv") as fp:
    lines = fp.readlines()[1:]  # skip the header row
X_train, X_train_text, Y, span_train = list(), list(), list(), list()

for line in lines:
    line = line.strip().split(',')
    # Empty cells are encoded with the -999 sentinel.
    row = [-999 if len(i) < 1 else float(i) for i in line[3:]]
    Y.append(float(line[2]))
    span_train.append(float(line[1]))
    # Row layout: [sigmoid(span)] + numeric features + [value from unique_words]
    # (0.0 when the id has no entry in unique_words).
    X_train.append([sigmoid(span_train[-1])]+row+unique_words.get(line[0], [0.0]))
    # Entry from docs for this id; ['0'] when the id has none.
    X_train_text.append(docs.get(line[0], ['0']))

X_train, Y, span_train = np.array(X_train), np.array(Y), np.array(span_train)
print("Training Shape: ", X_train.shape, Y.shape, span_train.shape)


# Testing data
# Fields as indexed below: 0 = id, 1 = span, 2.. = numeric features (no target column).
with open("../data/test.csv") as fp:
    lines = fp.readlines()[1:]  # skip the header row
X_test, X_test_text, ids, span_test = list(), list(), list(), list()

for line in lines:
    line = line.strip().split(',')
    # Empty cells are encoded with the -999 sentinel.
    row = [-999 if len(i) < 1 else float(i) for i in line[2:]]
    ids.append(line[0])
    span_test.append(float(line[1]))
    # Row layout: [sigmoid(span)] + numeric features + [value from unique_words]
    # (0.0 when the id has no entry in unique_words).
    X_test.append([sigmoid(span_test[-1])]+row+unique_words.get(line[0], [0.0]))
    # Entry from docs for this id; ['0'] when the id has none.
    X_test_text.append(docs.get(line[0], ['0']))


X_test, span_test = np.array(X_test), np.array(span_test)
print("Test Shape: ", X_test.shape, span_test.shape)

Training Shape:  (4176, 362) (4176,) (4176,)
Test Shape:  (1000, 362) (1000,)


# First, we compute a weighted average of LightGBM predictions trained on the best feature groups

In [2]:
from scipy.stats import pearsonr
from lightgbm import LGBMRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


# Feature_type0
# Uses columns 0..60 plus the last column of the feature matrix.
x_train = np.concatenate((X_train[:,:61], X_train[:,-1:]), axis=1)
x_test = np.concatenate((X_test[:,:61], X_test[:,-1:]), axis=1)
# `n_jobs` is no longer accepted by KMeans (removed in scikit-learn 1.0), so it
# is not passed. `n_init` is pinned to 10, the historical default, so the
# clustering does not silently change with the installed version.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)
# fit_transform/transform return each sample's distance to every cluster centre;
# those distances are appended to the raw columns as extra features.
x_train_k = kmeans.fit_transform(x_train)
x_test_k = kmeans.transform(x_test)

# NOTE: the former `missing=-999` argument is an XGBoost option; LightGBM does
# not recognise it and only emits an "unknown parameter" warning, so it has been
# dropped. The -999 sentinel is therefore treated as an ordinary numeric value.
model = LGBMRegressor(max_depth=7, learning_rate=0.01, n_estimators=600)
model.fit(np.concatenate((x_train, x_train_k), axis=1), Y)
preds_type0 = model.predict(np.concatenate((x_test, x_test_k), axis=1))

# Feature_type1
# Uses column 0 plus columns 61..120 of the feature matrix.
x_train = np.concatenate((X_train[:,:1], X_train[:,61:121]), axis=1)
x_test = np.concatenate((X_test[:,:1], X_test[:,61:121]), axis=1)

# NOTE: the former `missing=-999` argument is an XGBoost option; LightGBM does
# not recognise it and only emits an "unknown parameter" warning, so it has been
# dropped. The -999 sentinel is therefore treated as an ordinary numeric value.
model = LGBMRegressor(max_depth=6, learning_rate=0.01, n_estimators=556)
model.fit(x_train, Y)
preds_type1 = model.predict(x_test)


# Feature_type5
# Uses column 0 plus every column from index 301 onwards.
x_train = np.concatenate((X_train[:,:1], X_train[:,301:]), axis=1)
x_test = np.concatenate((X_test[:,:1], X_test[:,301:]), axis=1)

# NOTE: the former `missing=-999` argument is an XGBoost option; LightGBM does
# not recognise it and only emits an "unknown parameter" warning, so it has been
# dropped. The -999 sentinel is therefore treated as an ordinary numeric value.
model = LGBMRegressor(max_depth=6, learning_rate=0.01, n_estimators=556)
model.fit(x_train, Y)
preds_type5 = model.predict(x_test)


# Final weighting
# Fixed blend of the three models: 15% type0, 75% type1, 10% type5.
# Kept as a plain list because a later cell updates individual entries in place.
preds_weighted = [
    (a*0.15)+(b*0.75)+(c*0.1)
    for a, b, c in zip(preds_type0, preds_type1, preds_type5)
]

# Two-column array [id, prediction]; mixing str and float makes NumPy store
# both columns as strings.
submit_array = np.array([[i, j] for i, j in zip(ids, preds_weighted)])
# Order rows by lower-cased id, descending. Sorting on a separate key array
# avoids inserting and then deleting a temporary lower-cased column.
sort_key = np.char.lower(submit_array[:,0])
submit_array = submit_array[sort_key.argsort()[::-1]]

# Finally, write out the data
# Both columns of submit_array are strings, so they can be concatenated directly.
with open("../data/submit.csv", "w") as fp:  # closed (and flushed) even if a write fails
    fp.write("id,target\n")
    for i, j in submit_array:
        fp.write(i+","+j+"\n")

# Finally, we blend the previous weighted average with the predictions of another LightGBM model

In [3]:
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor




import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar.

    Args:
        x: A real number.

    Returns:
        A float in [0, 1]. For inputs so negative that ``math.exp(-x)``
        overflows, the limiting value 0.0 is returned instead of raising
        ``OverflowError``.
    """
    try:
        return 1/(1+math.exp(-x))
    except OverflowError:
        # exp(-x) is too large to represent, so the quotient is effectively 0.
        return 0.0


# Training data
# Re-reads the training file, this time keeping only rows whose span (field 1)
# is <= 7 and encoding empty cells as NaN instead of -999.
# NOTE: this rebinds X_train, Y and span_train, replacing the arrays built by the
# first loading cell; any cell that needs the full arrays must run before this one.
with open("../data/train_pruned.csv") as fp:
    lines = fp.readlines()[1:]  # skip the header row
X_train, Y, span_train = list(), list(), list()

for line in lines:
    line = line.strip().split(',')
    row = [np.nan if len(i) < 1 else float(i) for i in line[3:]]
    if float(line[1]) <= 7:
        span_train.append(float(line[1]))
        Y.append(float(line[2]))
        # Row layout: [sigmoid(span)] + numeric features.
        X_train.append([sigmoid(span_train[-1])]+row)

X_train, Y, span_train = np.array(X_train), np.array(Y), np.array(span_train)
print("Training Shape: ", X_train.shape, Y.shape, span_train.shape)


# Testing data
# Re-reads the test file, keeping only rows whose span (field 1) is <= 7 and
# encoding empty cells as NaN. The ids of the kept rows go to ids_small so their
# predictions can be matched back to the full id list later.
# NOTE: this rebinds X_test and span_test from the first loading cell.
with open("../data/test.csv") as fp:
    lines = fp.readlines()[1:]  # skip the header row
X_test, ids_small, span_test = list(), list(), list()

for line in lines:
    line = line.strip().split(',')
    row = [np.nan if len(i) < 1 else float(i) for i in line[2:]]
    if float(line[1]) <= 7:
        span_test.append(float(line[1]))
        ids_small.append(line[0])
        # Row layout: [sigmoid(span)] + numeric features.
        X_test.append([sigmoid(span_test[-1])]+row)


X_test, span_test = np.array(X_test), np.array(span_test)
print("Test Shape: ", X_test.shape, span_test.shape)


Training Shape:  (965, 361) (965,) (965,)
Test Shape:  (230, 361) (230,)


In [4]:
from sklearn.preprocessing import StandardScaler
# Standardised copy of the training features; scored as candidate 7 below.
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)

# For every feature column, score nine candidate transformations by the absolute
# Pearson correlation between the transformed column and Y.
# Candidate index: 0 identity, 1 sin, 2 cos, 3 sigmoid, 4 square, 5 cube,
# 6 fourth power, 7 standardised, 8 log(|x| - mean(x) + 1).
transformations = np.zeros((X_train.shape[1], 9))
for i in range(X_train.shape[1]):
    transformations[i,0] = abs(np.corrcoef(X_train[:,i], Y)[0,1])
    transformations[i,1] = abs(np.corrcoef(np.sin(X_train[:,i]), Y)[0,1])
    transformations[i,2] = abs(np.corrcoef(np.cos(X_train[:,i]), Y)[0,1])
    transformations[i,3] = abs(np.corrcoef(1/(1 + np.exp(-X_train[:,i])), Y)[0,1])
    transformations[i,4] = abs(np.corrcoef(np.power(X_train[:,i], 2), Y)[0,1])
    transformations[i,5] = abs(np.corrcoef(np.power(X_train[:,i], 3), Y)[0,1])
    transformations[i,6] = abs(np.corrcoef(np.power(X_train[:,i], 4), Y)[0,1])
    transformations[i,7] = abs(np.corrcoef(x_train[:,i], Y)[0,1])
    transformations[i,8] = abs(np.corrcoef(np.log(np.absolute(X_train[:,i])-np.mean(X_train[:,i])+1), Y)[0,1])

# np.argmax treats NaN as the maximum, so a candidate whose correlation is NaN
# (e.g. log of a non-positive value, or an overflowing power) would be chosen
# precisely because it is invalid. Mask NaN scores with -1, which is below any
# absolute correlation, so only valid candidates can win. A column whose scores
# are all NaN still resolves to index 0 (identity).
valid_scores = np.where(np.isnan(transformations), -1.0, transformations)
ind_trans = np.argmax(valid_scores, axis=1)
ind_trans

array([6, 6, 1, 1, 4, 1, 6, 6, 5, 6, 5, 5, 4, 6, 8, 5, 4, 1, 5, 1, 8, 5,
       8, 6, 4, 2, 1, 5, 2, 1, 5, 1, 1, 5, 4, 2, 6, 5, 1, 5, 2, 1, 8, 5,
       6, 4, 8, 6, 1, 5, 6, 4, 6, 5, 2, 6, 5, 5, 8, 1, 4, 8, 2, 8, 1, 0,
       8, 6, 2, 5, 8, 2, 2, 1, 4, 4, 5, 8, 8, 0, 2, 8, 4, 2, 5, 8, 8, 6,
       2, 1, 8, 6, 7, 4, 6, 8, 4, 8, 6, 2, 2, 2, 6, 5, 2, 2, 8, 2, 5, 2,
       6, 6, 5, 2, 6, 8, 8, 4, 6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [5]:
def simple(X):
    """Identity transformation: hand back the input object untouched."""
    unchanged = X
    return unchanged

def sin(X):
    """Element-wise sine of X (thin wrapper around ``np.sin``)."""
    sine_values = np.sin(X)
    return sine_values

def cos(X):
    """Element-wise cosine of X (thin wrapper around ``np.cos``)."""
    cosine_values = np.cos(X)
    return cosine_values

def sigmoid(X):
    """Element-wise logistic function 1 / (1 + e^-X), computed with NumPy."""
    decay = np.exp(-X)
    return 1/(1 + decay)

def square(X):
    """Element-wise second power of X."""
    exponent = 2
    return np.power(X, exponent)

def cube(X):
    """Element-wise third power of X."""
    exponent = 3
    return np.power(X, exponent)

def four(X):
    """Element-wise fourth power of X."""
    exponent = 4
    return np.power(X, exponent)

def log_trans(X):
    """Return log(|X| - mean(X) + 1), element-wise.

    The mean is taken over the whole of X as passed in. Wherever
    ``|x| - mean(X) + 1`` is not positive, ``np.log`` yields -inf or NaN.
    """
    shifted = np.absolute(X) - np.mean(X) + 1
    return np.log(shifted)

In [6]:
from sklearn.preprocessing import StandardScaler


# Standardised copies of both matrices; they supply the values for choice 7.
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

# Transformation selected for each choice index. Index 7 is absent because it
# copies the standardised column instead of calling a function.
column_transforms = {
    0: simple,
    1: sin,
    2: cos,
    3: sigmoid,
    4: square,
    5: cube,
    6: four,
    8: log_trans,
}

# NOTE: X_train and X_test are overwritten column by column, so running this
# cell a second time applies the selected transformations again.
for col in range(X_train.shape[1]):
    choice = ind_trans[col]
    if choice == 7:
        X_train[:,col] = x_train[:,col]
        X_test[:,col] = x_test[:,col]
    elif choice in column_transforms:
        transform = column_transforms[choice]
        X_train[:,col] = transform(X_train[:,col])
        X_test[:,col] = transform(X_test[:,col])

In [7]:
# Model trained on the short-span subset (all transformed feature columns)
# NOTE: the former `missing=-999` argument is an XGBoost option; LightGBM does
# not recognise it and only emits an "unknown parameter" warning, so it has been
# dropped.
model = LGBMRegressor(max_depth=3, learning_rate=0.01, n_estimators=600, max_bin=1000, num_leaves=1000)
model.fit(X_train, Y)
preds_small = model.predict(X_test)

# Map each short-span id to its prediction once, instead of scanning ids_small
# for every id. setdefault keeps the first occurrence of a repeated id, which
# matches what list.index() returned.
small_pred_by_id = dict()
for small_id, small_pred in zip(ids_small, preds_small):
    small_pred_by_id.setdefault(small_id, small_pred)

# For ids that also have a short-span prediction, replace the weighted
# prediction with a 50/50 average of the two.
# NOTE: preds_weighted is updated in place, so re-running this cell blends again.
for i in range(len(ids)):
    if ids[i] in small_pred_by_id:
        preds_weighted[i] = (preds_weighted[i]*0.5) + (small_pred_by_id[ids[i]]*0.5)

# Two-column array [id, prediction]; mixing str and float makes NumPy store
# both columns as strings.
submit_array = np.array([[i, j] for i, j in zip(ids, preds_weighted)])
# Order rows by lower-cased id, descending. Sorting on a separate key array
# avoids inserting and then deleting a temporary lower-cased column.
sort_key = np.char.lower(submit_array[:,0])
submit_array = submit_array[sort_key.argsort()[::-1]]

# Finally, write out the data
# Both columns of submit_array are strings, so they can be concatenated directly.
with open("../data/final_submit.csv", "w") as fp:  # closed (and flushed) even if a write fails
    fp.write("id,target\n")
    for i, j in submit_array:
        fp.write(i+","+j+"\n")