In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib


In [48]:
# Read the labelled training table from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="dataset/train.csv")

In [49]:
# Show the column names of the training frame that was just loaded
print(df.columns)

Index(['filename', 'label'], dtype='object')


In [50]:
# Placeholder feature: the number of characters in each filename.
# NOTE(review): this says nothing about the audio content; it only keeps the
# pipeline runnable until real features are extracted.
df['file_length'] = df['filename'].map(len)



In [51]:
# Feature matrix (kept two-dimensional by selecting with a list) and target vector
X = df.loc[:, ['file_length']]
y = df.loc[:, 'label']



#### Train-test split

In [52]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)
print(f"{X_train.shape} {X_val.shape}")

(355, 1) (89, 1)


In [53]:
# Preview the first rows, then summarise dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() would emit a stray "None" after the summary.
print(df.head())
df.info()

         filename  label  file_length
0  audio_1261.wav    1.0           14
1   audio_942.wav    1.5           13
2  audio_1110.wav    1.5           14
3  audio_1024.wav    1.5           14
4   audio_538.wav    2.0           13
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 444 entries, 0 to 443
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   filename     444 non-null    object 
 1   label        444 non-null    float64
 2   file_length  444 non-null    int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 10.5+ KB
None


### Feature Scaling

In [54]:
# Learn the scaling statistics from the training split only, then apply the same
# transform to both splits so the validation rows never influence the fit.
# NOTE: this cell overwrites X_train / X_val, so it is not idempotent; re-running
# it without re-running the split would fit the scaler on already-scaled arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

## Model Training

In [55]:
# Build and train the random forest in one chained call; fit() returns the fitted
# estimator, and the seed keeps the trained forest repeatable between runs
model = RandomForestRegressor(random_state=42, max_depth=15, n_estimators=150).fit(
    X_train, y_train
)
# Trailing expression keeps the estimator summary as the cell output
model


0,1,2
,n_estimators,150
,criterion,'squared_error'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [56]:
# Score the fitted model on the held-out validation rows
val_preds = model.predict(X_val)
val_mae = mean_absolute_error(y_val, val_preds)
val_r2 = r2_score(y_val, val_preds)
print(f"MAE: {val_mae:.3f}")
print(f"R2 Score: {val_r2:.3f}")

MAE: 1.032
R2 Score: 0.065


In [57]:
# Persist the fitted estimator and its matching scaler side by side; both are
# needed to reproduce predictions later
joblib.dump(value=model, filename="grammar_model.joblib")
joblib.dump(value=scaler, filename="scaler.joblib")

['scaler.joblib']

### Check

In [58]:
# Re-check the training frame: its dimensions first, then the first few rows
print(df.shape, df.head(), sep="\n")


(444, 3)
         filename  label  file_length
0  audio_1261.wav    1.0           14
1   audio_942.wav    1.5           13
2  audio_1110.wav    1.5           14
3  audio_1024.wav    1.5           14
4   audio_538.wav    2.0           13


### Prediction on Test Data

In [59]:
# Load test data.
# The frame is bound to `test_df` because every later cell (feature creation,
# scaling, sanity check) reads that name; binding it to `test` left `test_df`
# undefined on a fresh kernel.
test_df = pd.read_csv("dataset/test.csv")

In [60]:
# Same placeholder feature as used for training: characters per filename
test_df['file_length'] = test_df['filename'].map(len)

In [61]:
# Apply the already-fitted scaler (transform only, no refit) to the test feature
X_test_final = scaler.transform(test_df.loc[:, ['file_length']])

#### Grammar Scores Prediction

In [62]:
# Predict a score for every scaled test row with the fitted model
test_preds = model.predict(X_test_final)

In [63]:
# Sanity check.
# Build the preview frame here from the test filenames and their predictions;
# `submission` was never defined anywhere, so printing it raised NameError on a
# fresh kernel.
submission = pd.DataFrame({"filename": test_df["filename"], "label": test_preds})
print(test_df.shape)
print(test_df.head())
print(submission.head())

(195, 2)
         filename  file_length
0   audio_706.wav           13
1   audio_800.wav           13
2    audio_68.wav           12
3  audio_1267.wav           14
4   audio_683.wav           13
         filename     label
0   audio_706.wav  3.703399
1   audio_800.wav  3.703399
2    audio_68.wav  3.536998
3  audio_1267.wav  3.439359
4   audio_683.wav  3.703399


### Create Submission File

In [64]:
# Read the submission template that the predictions will be written into
sample_sub = pd.read_csv(filepath_or_buffer="dataset/sample_submission.csv")

In [65]:
# Add predictions to the 'label' column.
# Align by filename rather than by row position, so a template whose rows are
# ordered differently from test.csv still receives the right score per file.
pred_by_filename = dict(zip(test_df['filename'], test_preds))
sample_sub['label'] = sample_sub['filename'].map(pred_by_filename)

# Fail loudly if the template names a file that received no prediction
missing_labels = sample_sub['label'].isna()
if missing_labels.any():
    raise ValueError(
        f"{int(missing_labels.sum())} sample_submission rows have no matching prediction"
    )

In [66]:
# Write the filled-in template to disk without the pandas row index, then confirm
sample_sub.to_csv(path_or_buf="submission.csv", index=False)
print("Submission File is created")

Submission File is created


### Final Check

In [67]:
# Show the first rows of the submission frame after the file has been written
print(sample_sub.head())

         filename     label
0   audio_706.wav  3.703399
1   audio_800.wav  3.703399
2    audio_68.wav  3.536998
3  audio_1267.wav  3.439359
4   audio_683.wav  3.703399
