In [131]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer,FunctionTransformer
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [132]:

# Define transformations
transformers = [
    ("lux", Pipeline([
        ("scaler", MinMaxScaler()),
        ("transform", PowerTransformer(method="yeo-johnson"))
            ]), ["lux"]),
    ("gox", Pipeline([
        ("scaler", StandardScaler()),
        ("transform",PowerTransformer(method="yeo-johnson"))
    ]), ["gox"]),
    ("foo", MinMaxScaler(), ["foo"]),
    ("xgt", StandardScaler(), ["xgt"]),
    ("qgg", StandardScaler(), ["qgg"]),
    ("bar", Pipeline([
        ("scaler", StandardScaler()),
        ("transform", FunctionTransformer(np.log1p,feature_names_out='one-to-one'))
    ]), ["bar"]),
    ("hrt", Pipeline([
        ("scaler", MinMaxScaler()),
        ("transform", FunctionTransformer(np.log1p,feature_names_out='one-to-one'))
    ]), ["hrt"]),
    ("juu", MinMaxScaler(), ["juu"]),
    ("yyz",Pipeline([
        ("scaler", StandardScaler()),
        ("transform", PowerTransformer(method="yeo-johnson"))
            ]), ["yyz"]),
    ("drt", Pipeline([
        ("scaler", MinMaxScaler()),
        ("transform", PowerTransformer(method="yeo-johnson"))
            ]), ["drt"])
]

# ColumnTransformer replaces each column rather than adding new columns
column_transformer = ColumnTransformer(
    transformers=transformers,
    remainder="drop",  # Keep untouched columns in their original position
    verbose_feature_names_out=False  # Use original column names
)

# Build the pipeline
pipeline = Pipeline(steps=[
    ("preprocessing", column_transformer)
])

In [133]:
training_path = "/Users/cipri/Downloads/DataScience:MLTest/training.csv"
data = pd.read_csv(training_path)

In [134]:
data["date"]=pd.to_datetime(data["date"])
data["year"] = data["date"].dt.year

In [135]:
numerical_features = ['drt','bar','xgt', 'qgg', 'lux', 'yyz', 'gox', 'foo','hrt', 'juu']
cat_features = ['fyt', 'lgh','year']
target_feature = ["target"]
selected_features = numerical_features + cat_features

In [136]:
pp = pipeline.fit(data[numerical_features])

In [137]:
matrix_pp = pp.transform(data)
data_pp = pd.DataFrame(matrix_pp, columns=numerical_features)
data_pp[cat_features] = data[cat_features]
data_pp["target"] = data["target"].tolist()

In [138]:
joblib.dump(pp, "pipeline.pkl")

['pipeline.pkl']

In [139]:
train_df, test_df = train_test_split(data_pp, test_size=0.1, random_state=42)

regressor = RandomForestRegressor(random_state=43)

X_train = train_df[selected_features]
y_train = train_df["target"].tolist()

X_eval = test_df[selected_features]
y_eval = test_df["target"].tolist()

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_eval)
mse = mean_squared_error(y_eval, y_pred)  # Mean Squared Error
print("RMSE",np.sqrt(mse))

RMSE 42707.24148149509


In [140]:
joblib.dump(regressor, "trained_model.pkl")

['trained_model.pkl']

Adding predictions

In [141]:
testing_path = "/Users/cipri/Downloads/DataScience:MLTest/test.csv"
df_test = pd.read_csv(testing_path)

In [142]:
df_test["date"]=pd.to_datetime(df_test["date"])
df_test["year"] = df_test["date"].dt.year

In [143]:
numerical_features = ['drt','bar','xgt', 'qgg', 'lux', 'yyz', 'gox', 'foo','hrt', 'juu']
cat_features = ['fyt', 'lgh','year']
selected_features = numerical_features + cat_features

In [144]:
matrix_pp = pp.transform(df_test[numerical_features])
df_test_pp = pd.DataFrame(matrix_pp, columns=numerical_features)
df_test_pp[cat_features] = df_test[cat_features]

In [145]:
df_test["predicted target"]= regressor.predict(df_test_p[selected_features]).flatten()

In [146]:
df_test.to_csv("/Users/cipri/Downloads/DataScience:MLTest/test_with_predictions.csv", index=False)

In [147]:
pd.read_csv("/Users/cipri/Downloads/DataScience:MLTest/test_with_predictions.csv")

Unnamed: 0,image,date,bar,baz,xgt,qgg,lux,wsg,yyz,drt,gox,foo,boz,fyt,lgh,hrt,juu,year,predicted target
0,test-img/boat/boat_3.jpg,2020-11-18,495.361007,0,331.966668,21.021535,5190.710171,0.015209,-12.344110,-4.587213,-11.226993,-7.996372,5.512998,1,0,-239.178795,245.806207,2020,-44043.338870
1,test-img/person/person_53.jpg,2019-01-20,127.415879,1,396.851851,5.630238,4767.268227,0.010817,5.499534,-25.675941,3.239795,23.326502,3.817169,0,1,-192.272913,126.256960,2019,21474.404028
2,test-img/boat/boat_62.jpg,2019-07-05,-48.353010,0,414.096801,-0.406343,4452.181336,0.010609,-0.746068,-15.213459,-0.518122,38.568789,3.019702,0,0,-113.590865,121.513203,2019,59752.713931
3,test-img/car/car_7.jpg,2020-12-15,53.163824,0,365.537458,15.611367,5864.571557,0.013082,-17.029956,-38.933399,14.905780,43.321533,0.935854,1,0,-27.423755,183.329267,2020,-121840.841875
4,test-img/cat/cat_2.jpg,2020-03-20,-32.591336,0,263.053669,-5.938351,4983.384140,0.015802,-7.017901,-7.066405,-25.428807,37.850267,0.213952,1,1,-192.477269,264.677689,2020,29921.839089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,test-img/cat/cat_22.jpg,2018-08-25,635.905103,1,265.568956,13.205972,6218.386139,0.009958,-22.508451,-3476.106033,6.652591,44.497852,113.737014,0,0,34.609875,107.259544,2018,-123511.466464
103,test-img/llama/llama_39.jpg,2020-04-30,-49.033202,0,-132.867432,-0.852550,4546.338200,0.007076,0.408917,-1.384199,-0.282366,47.206921,3.178050,1,0,-327.279878,54.518519,2020,54585.202144
104,test-img/airplane/airplane_21.jpg,2020-12-08,226.581519,0,-116.669635,-18.865954,4791.772277,0.016145,8.229320,-3.479010,18.144738,18.764662,1.456052,0,0,-140.698962,275.884990,2020,20541.775686
105,test-img/dog/dog_80.jpg,2021-02-06,135.592952,0,66.279837,18.033609,5074.465347,0.018193,-15.472254,-109.737508,13.744837,13.214807,0.493897,0,0,-339.585373,347.019496,2021,2034.513570
