# Problem Statement-
Thermophysical Property: Melting Point <br>
Goal:The goal is to build ML models that predict melting point (°C) for organic compounds given molecular descriptors.

## Exploratory Data Analysis :

In [36]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import warnings
warnings.filterwarnings("ignore")

In [37]:
df=pd.read_csv("train.csv")

In [38]:
df

Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424
0,2175,FC1=C(F)C(F)(F)C1(F)F,213.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1222,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2994,CCN1C(C)=Nc2ccccc12,324.15,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1704,CC#CC(=O)O,351.15,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2526,CCCCC(S)C,126.15,2,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2657,707,ClCCBr,256.45,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2658,1573,N#CC(Cl)(Cl)Cl,231.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2659,1634,Cc1ccc2c(C)cccc2c1,256.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2660,3022,CCC(=O)c1ccc2ccccc2c1,333.15,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
zero_count_per_col = (df == 0).sum()
print(zero_count_per_col)

id              0
SMILES          0
Tm              0
Group 1      1251
Group 2      1739
             ... 
Group 420    2662
Group 421    2662
Group 422    2662
Group 423    2662
Group 424    2662
Length: 427, dtype: int64


In [40]:
df.shape

(2662, 427)

In [41]:

list1=[]
for i in df.columns:
    zero_count_per_col = (df[i] == 0).sum()
    if zero_count_per_col>=2662:
        list1.append(i)
        df.drop(i,axis=1,inplace=True)
print("Columns removed with no data",len(list1))

Columns removed with no data 87


In [42]:
df.shape

(2662, 340)

In [43]:
df.head()

Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 406,Group 407,Group 408,Group 409,Group 410,Group 412,Group 414,Group 415,Group 416,Group 418
0,2175,FC1=C(F)C(F)(F)C1(F)F,213.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1222,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2994,CCN1C(C)=Nc2ccccc12,324.15,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1704,CC#CC(=O)O,351.15,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2526,CCCCC(S)C,126.15,2,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
df["SMILES"].nunique()

2662

In [45]:
df.drop("SMILES",axis=1,inplace=True)

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import VarianceThreshold

x = df.drop(columns=["Tm"])   # change target column
y = df["Tm"]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [47]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()
rf.fit(x_train,y_train)

In [48]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# 1. Apply variance threshold
vt = VarianceThreshold(threshold=0.01)
x_train_vt = vt.fit_transform(x_train)
x_test_vt = vt.transform(x_test)   # renamed for clarity

selected_cols = x_train.columns[vt.get_support()]
x_train_vt = pd.DataFrame(x_train_vt, columns=selected_cols)
x_test_vt = pd.DataFrame(x_test_vt, columns=selected_cols)

# 2. Train Random Forest on filtered features
rf_vt = RandomForestRegressor(random_state=42)
rf_vt.fit(x_train_vt, y_train)

# 3. Feature importance
feat_imp = pd.Series(
    rf_vt.feature_importances_,
    index=x_train_vt.columns
).sort_values(ascending=False)

print(feat_imp.head(20))

Group 15     0.187478
id           0.172246
Group 2      0.129380
Group 31     0.047916
Group 1      0.037383
Group 41     0.026084
Group 123    0.022182
Group 29     0.019821
Group 180    0.019012
Group 20     0.014048
Group 21     0.013291
Group 373    0.012469
Group 129    0.012254
Group 168    0.012214
Group 412    0.011469
Group 16     0.011443
Group 175    0.010725
Group 170    0.009842
Group 5      0.009065
Group 30     0.008737
dtype: float64


In [49]:
imp_features = ["Group 15", "id", "Group 2", "Group 31", "Group 1", "Group 41", "Group 123", "Group 29", "Group 180", "Group 20","Tm"]
for i in df.columns:
    if i not in imp_features:
        df.drop(i,axis=1,inplace=True)
        

In [50]:
print("Imp features :",df.columns)

Imp features : Index(['id', 'Tm', 'Group 1', 'Group 2', 'Group 15', 'Group 20', 'Group 29',
       'Group 31', 'Group 41', 'Group 123', 'Group 180'],
      dtype='object')


In [52]:
from sklearn.model_selection import train_test_split
x = df.drop(columns=["Tm"])   # change target column
y = df["Tm"]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Train on FULL original dataset (like your MAE 36.88)
lr = LinearRegression()
lr.fit(x_train, y_train)  # x_train = ALL your original features

y_pred = lr.predict(x_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R2: {r2_score(y_test, y_pred):.3f}")

joblib.dump(lr, 'melting_point_champion.pkl')

# Predict test_result with matching features
y_pred_test = lr.predict(test_result[x_train.columns])
test_result['Predicted_Melting_Point'] = y_pred_test
test_result.to_csv('predictions.csv', index=False)


MAE: 54.06
R2: 0.304
