# Linear model fitted on text embedding

In [2]:
import pandas as pd

from sklearn import set_config
from sklearn.metrics import mean_squared_error
set_config(transform_output="pandas")

pd.set_option('mode.chained_assignment', None)
pd.options.display.max_rows = 1000

%matplotlib inline


def calc_metrics(submission, dtypes=["train", "test"]):
    result = {}
    for dtype in dtypes:
        name = f"MSE_{dtype}"
        mse = None
        sample = submission[submission["type"] == dtype]
        if not sample["rating"].isnull().all():
            mse = mean_squared_error(sample["rating"], sample["predict_rating"])

        result[name] = mse

    return result

In [3]:
# Your Name Surname
# !!! Please do not change the NAME in other hypotheses
NAME = "Belton_Manhica"
train_and_test = pd.read_csv(f"0_{NAME}.csv")

# split dataset for train and test part
train = train_and_test[train_and_test["type"] == "train"]
test = train_and_test[train_and_test["type"] == "test"]
train_and_test.shape, train.shape, test.shape

((10000, 3), (5000, 3), (5000, 3))

In [3]:
train_and_test.sample(5)

Unnamed: 0,comment,rating,type
7942,A bit rule-intense and overwhelming for beginn...,,test
7982,Fun party game but loses points for extreme ch...,,test
1394,"I like the game , and I like the fuel market m...",8.0,train
8533,"I really like <UNK> , but can see how one migh...",,test
9317,Bought by Henry Do n't like the bidding aspect...,,test


In [4]:
import torch 
import numpy as np
from tqdm import tqdm
from transformers import pipeline


def custom_func(x):
    # apply mean averaging
    x = np.array(x)[0]
    return np.mean(x, axis=0)

# select model from https://huggingface.co/models?pipeline_tag=feature-extraction&library=transformers&sort=downloads
pipe = pipeline('feature-extraction', model="BAAI/bge-m3", device= 0)

# convert 5 element just for test
string = train_and_test["comment"].head(5).values.tolist()

embedding_string = [custom_func(x) for x in tqdm(pipe(string))]
dataset = pd.DataFrame(embedding_string)
dataset

Device set to use mps:0
100%|██████████| 5/5 [00:00<00:00, 286.25it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.282132,-0.538673,-0.931741,0.610421,0.367078,-0.318613,0.908671,0.209497,-0.004999,0.514849,...,-0.707038,0.177922,0.200121,0.569938,-0.772845,-0.837287,-0.300984,0.010135,-0.282968,-0.498781
1,0.576543,0.269578,-0.404093,0.335713,-0.038606,-0.169334,0.368196,0.636455,-0.14516,-0.164351,...,0.000116,0.154771,0.622342,0.448574,0.038962,0.002933,0.570183,-0.017711,-0.367127,-0.136866
2,-0.642782,-0.276796,-1.154986,0.6481,0.038852,-0.440362,0.580749,0.387414,0.209031,0.163476,...,-0.185547,0.331623,-0.082589,0.483651,-0.683969,-0.411244,0.506481,0.003312,-0.802425,-0.529536
3,-0.31296,0.307666,-1.132566,0.206477,-0.139065,-0.523639,0.527207,0.247038,-0.218915,0.272797,...,-0.051153,0.683492,0.978417,0.606247,-0.173544,-0.033573,0.698721,0.460546,-0.843795,-0.427755
4,0.464596,0.159915,-1.144469,0.204878,0.268719,-0.624845,0.848963,0.147841,-0.545972,0.147352,...,-0.274912,0.066996,0.709244,0.822171,-0.974031,-0.461581,-0.553948,-0.546887,-0.399357,0.122171


In [10]:
from sklearn.linear_model import LinearRegression

reg_model = LinearRegression()

head_5 = train_and_test["rating"].head(5).values.tolist()

reg_model.fit(dataset,head_5)

predict = reg_model.predict(dataset)


In [11]:
mean_squared_error(y_true= head_5, y_pred=predict)


7.888609052210118e-31

Work with the whole date


In [5]:
pipe = pipeline('feature-extraction', model="BAAI/bge-m3", device= 0)

strings = train_and_test["comment"].tolist()

embedding_strings = [custom_func(x) for x in tqdm(pipe(strings))]
emb_dataset = pd.DataFrame(embedding_strings)
emb_dataset.head(5)

Device set to use mps:0
100%|██████████| 10000/10000 [01:42<00:00, 97.88it/s] 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.282132,-0.538673,-0.931741,0.610421,0.367078,-0.318613,0.908671,0.209497,-0.004999,0.514849,...,-0.707038,0.177922,0.200121,0.569938,-0.772845,-0.837287,-0.300984,0.010135,-0.282968,-0.498781
1,0.576543,0.269578,-0.404093,0.335713,-0.038606,-0.169334,0.368196,0.636455,-0.14516,-0.164351,...,0.000116,0.154771,0.622342,0.448574,0.038962,0.002933,0.570183,-0.017711,-0.367127,-0.136866
2,-0.642782,-0.276796,-1.154986,0.6481,0.038852,-0.440362,0.580749,0.387414,0.209031,0.163476,...,-0.185547,0.331623,-0.082589,0.483651,-0.683969,-0.411244,0.506481,0.003312,-0.802425,-0.529536
3,-0.31296,0.307666,-1.132566,0.206477,-0.139065,-0.523639,0.527207,0.247038,-0.218915,0.272797,...,-0.051153,0.683492,0.978417,0.606247,-0.173544,-0.033573,0.698721,0.460546,-0.843795,-0.427755
4,0.464596,0.159915,-1.144469,0.204878,0.268719,-0.624845,0.848963,0.147841,-0.545972,0.147352,...,-0.274912,0.066996,0.709244,0.822171,-0.974031,-0.461581,-0.553948,-0.546887,-0.399357,0.122171


In [6]:
#merge data
final_data = pd.concat([emb_dataset, train_and_test['rating'],train_and_test['type']], axis = 1)
final_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1016,1017,1018,1019,1020,1021,1022,1023,rating,type
0,-0.282132,-0.538673,-0.931741,0.610421,0.367078,-0.318613,0.908671,0.209497,-0.004999,0.514849,...,0.200121,0.569938,-0.772845,-0.837287,-0.300984,0.010135,-0.282968,-0.498781,7.0,train
1,0.576543,0.269578,-0.404093,0.335713,-0.038606,-0.169334,0.368196,0.636455,-0.145160,-0.164351,...,0.622342,0.448574,0.038962,0.002933,0.570183,-0.017711,-0.367127,-0.136866,7.5,train
2,-0.642782,-0.276796,-1.154986,0.648100,0.038852,-0.440362,0.580749,0.387414,0.209031,0.163476,...,-0.082589,0.483651,-0.683969,-0.411244,0.506481,0.003312,-0.802425,-0.529536,9.5,train
3,-0.312960,0.307666,-1.132566,0.206477,-0.139065,-0.523639,0.527207,0.247038,-0.218915,0.272797,...,0.978417,0.606247,-0.173544,-0.033573,0.698721,0.460546,-0.843795,-0.427755,7.0,train
4,0.464596,0.159915,-1.144469,0.204878,0.268719,-0.624845,0.848963,0.147841,-0.545972,0.147352,...,0.709244,0.822171,-0.974031,-0.461581,-0.553948,-0.546887,-0.399357,0.122171,8.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.616821,0.344291,-1.044981,-0.643033,-0.060942,0.155411,1.298204,0.255533,-0.153470,-0.042697,...,0.107105,-0.008455,-0.704824,-0.477130,0.701735,-0.151315,-0.720391,-0.562900,,test
9996,-0.716948,0.131377,-0.834674,0.145579,-0.182448,-0.378106,0.909309,-0.278380,-0.044775,0.449578,...,1.213559,1.178171,-0.327493,-0.686271,0.703568,-0.336423,-0.430428,-1.161135,,test
9997,-0.358043,0.446290,-0.638703,0.160982,0.038369,-0.132108,0.881147,0.277545,-0.687307,0.033222,...,0.531126,0.442368,-0.405425,-0.143813,0.259838,-0.281793,-0.368705,-0.389994,,test
9998,-0.315974,0.477777,-1.584866,0.359272,-0.165661,-0.157165,0.918708,0.058475,-0.098361,-0.308178,...,1.415335,0.615942,-0.714385,-0.123835,0.319438,0.224900,-0.705376,-0.322033,,test


In [7]:
#split the data into train and validation sets
from sklearn.model_selection import train_test_split

train_, valid  = train_test_split(final_data[final_data["type"] == "train"],test_size= 0.5, random_state= 42 )

train_.shape, valid.shape

((2500, 1026), (2500, 1026))

In [8]:
#select the feature vector
X_final = final_data.iloc[:, 0:384]
X_train = final_data[final_data["type"] == "train"].iloc[:, 0:383]
X_train_ = train_.iloc[:, 0:384]
X_valid = valid.iloc[:, 0:384]
X_test = final_data[final_data["type"] == "test"].iloc[:, 0:383]

#select the target vector
y_final = final_data['rating']
y_train = final_data.loc[final_data["type"] == "train",'rating']
y_train_ = train_['rating']
y_valid = valid['rating']
y_test = final_data.loc[final_data["type"] == "test",'rating']

In [9]:
#get a list of train and validation sets
get_X_y = [
    ('train', X_train_, train_),
    ('valid', X_valid, valid)
]

In [10]:
#Define models and hyperparameters
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

models = {
    "LinearRegression": (LinearRegression(), {}),
    "Ridge": (Ridge(), {"alpha": [1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}),
    "Lasso": (Lasso(), {"alpha": [0.001, 0.01, 0.1, 1.0]}),
    "ElasticNet": (ElasticNet(), {"alpha": [0.001, 0.01, 0.1, 1.0], "l1_ratio": [0.1, 0.5, 0.9]})
}

In [18]:
#gridsearch
from sklearn.model_selection import GridSearchCV

best_model = None
best_score = -np.inf
best_name = None

for name, (model, params) in models.items():
    grid = GridSearchCV(model, params, scoring='neg_mean_squared_error', cv=5)
    grid.fit(X_train_, y_train_)
    print(f"{name}: best MSE = {grid.best_score_:.3f}, params = {grid.best_params_}")

    if grid.best_score_ > best_score:
        best_score = grid.best_score_
        best_model = grid.best_estimator_
        best_name = name

LinearRegression: best MSE = -2.047, params = {}
Ridge: best MSE = -1.764, params = {'alpha': 100}
Lasso: best MSE = -1.755, params = {'alpha': 0.01}
ElasticNet: best MSE = -1.754, params = {'alpha': 0.01, 'l1_ratio': 0.9}


In [12]:
#test on smaller data

name, X_, y_ = get_X_y[0]

#set up the model
reg_best = ElasticNet(alpha = 0.01, l1_ratio = 0.9 )

#fit the model
reg_best.fit(X_, y_['rating'])

#make predictions
for name, X_, y_ in get_X_y:
    pred_best = reg_best.predict(X_)
    pred_best[pred_best<0]=0
    pred_best[pred_best>10]=10
    y_['pred_best']= pred_best

    #print scores
    score_ridge_best = mean_squared_error(y_true= y_['rating'], y_pred=y_['pred_best'])
    print(f'{name}: {score_ridge_best:.4f}')


train: 1.5345
valid: 1.7629


In [13]:
#predict on the entire data

#set up the model
reg_final = ElasticNet(alpha = 0.01, l1_ratio = 0.9 )

#fit the model
reg_final.fit(X_train, y_train )

#make predictions
train_pred = reg_final.predict(X_train)
test_pred = reg_final.predict(X_test)

#eliminate negative predictions
train_pred[train_pred<0] =0
test_pred[test_pred<0] = 0

#eliminate predictions greater than 10
train_pred[train_pred>10] =10
test_pred[test_pred>10] = 10


#add the predictions to the dataset
train_and_test['predict_rating'] = None
train_and_test.loc[train_and_test['type']== 'train','predict_rating'] = train_pred
train_and_test.loc[train_and_test['type']== 'test','predict_rating'] = test_pred

#print scores
score_train_all = calc_metrics(train_and_test, dtypes=['train'])

print(score_train_all)

{'MSE_train': 1.6323016010725466}


# Final checks and prepare submission


In [None]:
if train_and_test.shape[0] != 10000:
    raise ValueError(f'Incorrect train_and_test file shape should be a 10000. {train_and_test.shape[0]} are given')

if "predict_rating" not in train_and_test.columns:
    raise ValueError(f'Column "predict_rating" should be in train_and_test dataset')

if train_and_test["predict_rating"].isnull().sum() > 0:
    raise ValueError(f'Column "predict_rating" have null values')

if (train_and_test["predict_rating"] < 0.).sum() > 0:
    raise ValueError(f'Column "predict_rating" contain negative values')

if (train_and_test["predict_rating"] > 10.).sum() > 0:
    raise ValueError(f'Column "predict_rating" contain values more than 10.')

train_and_test[["predict_rating"]].to_csv(f'2_{NAME}.csv', index=False)