In [175]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [176]:
# Read the training and test CSV files (in that order) into DataFrames.
train_df, test_df = map(
    pd.read_csv,
    (r'E:/IITK/P2 data/carvan_train.csv', r'E:/IITK/P2 data/carvan_test.csv'),
)

In [177]:
# Preview the first five rows of each frame, each under its own label line.
print("train_df:", train_df.head(), sep="\n")
print("\ntest_df:", test_df.head(), sep="\n")

train_df:
   V1  V2  V3  V4  V5  V6  V7  V8  V9  V10  ...  V77  V78  V79  V80  V81  V82  \
0  33   1   3   2   8   0   5   1   3    7  ...    0    0    0    1    0    0   
1  37   1   2   2   8   1   4   1   4    6  ...    0    0    0    1    0    0   
2  37   1   2   2   8   0   4   2   4    3  ...    0    0    0    1    0    0   
3   9   1   3   3   3   2   3   2   4    5  ...    0    0    0    1    0    0   
4  40   1   4   2  10   1   4   1   4    7  ...    0    0    0    1    0    0   

   V83  V84  V85  V86  
0    0    0    0    0  
1    0    0    0    0  
2    0    0    0    0  
3    0    0    0    0  
4    0    0    0    0  

[5 rows x 86 columns]

test_df:
   V1  V2  V3  V4  V5  V6  V7  V8  V9  V10  ...  V76  V77  V78  V79  V80  V81  \
0  33   1   4   2   8   0   6   0   3    5  ...    0    0    0    0    1    0   
1   6   1   3   2   2   0   5   0   4    5  ...    2    0    0    0    1    0   
2  39   1   3   3   9   1   4   2   3    5  ...    1    0    0    0    1    0   
3 

In [178]:
# List every column label of the test frame on one line.
print(list(test_df.columns))

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85']


In [179]:
# List every column label of the training frame on one line.
print(list(train_df.columns))

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86']


In [180]:
# Report (rows, columns) for both frames.
for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)

Train shape: (5822, 86)
Test shape: (4000, 85)


In [181]:
# Separate features from the target by column name rather than by position,
# so the split does not depend on the target being the last column.
# "V86" is the column present in train_df but absent from test_df.
X = train_df.drop(columns="V86")
y = train_df["V86"]

In [182]:
# Hold out 20% of the rows as a validation set, stratified on y, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [183]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# Select the test columns by the training feature names so the test matrix
# has exactly the same columns, in the same order, as the data the scaler saw.
# A missing feature fails here with a KeyError naming the column.
X_test_scaled = scaler.transform(test_df[X_train.columns])


In [184]:
from sklearn.linear_model import LogisticRegression

In [185]:
# Logistic regression with class_weight='balanced', fitted on the scaled training split.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

In [186]:
# Predict the validation split and score it with F-beta (beta=2).
val_preds = model.predict(X=X_val_scaled)
fbeta = fbeta_score(y_true=y_val, y_pred=val_preds, beta=2)
print(f"F2 Score on Validation Set: {fbeta:.4f}")

F2 Score on Validation Set: 0.3649


In [187]:
# Class predictions for the scaled test features.
test_preds = model.predict(X=X_test_scaled)

In [188]:
# Build the submission table: predictions first, then a 0-based running Id
# inserted as the leading column, and write it out without the index.
submission = pd.DataFrame({"Prediction": test_preds})
submission.insert(0, "Id", np.arange(len(test_preds)))
submission.to_csv(r'E:/IITK/P2 data/submission.csv', index=False)
print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.


In [206]:
# Show the first five submission rows.
print(submission.head(n=5))

   Id  Prediction
0   0           0
1   1           1
2   2           1
3   3           0
4   4           0


In [1]:
# 1️⃣ Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score

# 2️⃣ Load datasets
# Single place to change where the CSV files live; every path below derives from it.
DATA_DIR = r'E:/IITK/P2 data'
train_df = pd.read_csv(f"{DATA_DIR}/carvan_train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/carvan_test.csv")

# 3️⃣ Split features and target
TARGET_COL = "V86"   # V86 is the target (CARAVAN)
X = train_df.drop(columns=TARGET_COL)
y = train_df[TARGET_COL]

# 4️⃣ Train-validation split (stratified to handle imbalance)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 5️⃣ Standard scaling (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# Select the test columns by the training feature names so the test matrix has
# the same columns, in the same order, as the data the scaler was fitted on.
X_test_scaled = scaler.transform(test_df[X.columns])

# 6️⃣ Train logistic regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model.fit(X_train_scaled, y_train)

# 7️⃣ Predict on validation set
val_preds = model.predict(X_val_scaled)

# 8️⃣ Compute F-beta score (β=2)
fbeta = fbeta_score(y_val, val_preds, beta=2)
print(f"Validation F-beta score (β=2): {fbeta:.4f}")

# 9️⃣ Predict on test data
test_preds = model.predict(X_test_scaled)

# 10️⃣ Create submission DataFrame (0-based running Id + predicted class)
submission = pd.DataFrame({
    "Id": np.arange(len(test_preds)),
    "Prediction": test_preds
})

submission.to_csv(f"{DATA_DIR}/submission2.csv", index=False)
print("Submission file 'submission2.csv' created successfully.")


Validation F-beta score (β=2): 0.3649
Submission file 'submission2.csv' created successfully.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Single place to change where the CSV files live; every path below derives from it.
DATA_DIR = r'E:/IITK/P2 data'
train_df = pd.read_csv(f"{DATA_DIR}/carvan_train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/carvan_test.csv")

# Features and target (selected by name; no validation split in this cell)
TARGET_COL = "V86"
X_train = train_df.drop(columns=TARGET_COL)
y_train = train_df[TARGET_COL]

# Scale features: fit on the training data, reuse the fitted scaler for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Select the test columns by the training feature names so the test matrix has
# the same columns, in the same order, as the data the scaler was fitted on.
X_test_scaled = scaler.transform(test_df[X_train.columns])

# Train model on full train data
model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model.fit(X_train_scaled, y_train)

# Predict on test data
test_preds = model.predict(X_test_scaled)

# Save submission (this variant writes only the Prediction column, no Id)
submission = pd.DataFrame({
    "Prediction": test_preds
})
submission.to_csv(f"{DATA_DIR}/submission3.csv", index=False)
print("Submission file 'submission3.csv' created successfully.")


Submission file 'submission3.csv' created successfully.
