In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import xgboost as xgb

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so that the '/content/drive/MyDrive/...'
# CSV paths read by the later cells resolve. This cell depends on the
# google.colab package, so it is expected to run only inside a Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the labelled training set from the mounted Google Drive folder.
train = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Portfolio/NLP/train.csv'
)

In [None]:
# Separate model inputs from labels:
#   X -> the three raw string columns that are later merged into one document
#   y -> the 'target' column the classifier is trained to predict
X, y = train.loc[:, ['text', 'keyword', 'location']], train.loc[:, 'target']

In [None]:
# Build one free-text document per row: text + keyword + location, with
# missing values replaced by '' so the string concatenation never yields NaN.
text_all = (X['text'].fillna('') + ' ' +
            X['keyword'].fillna('') + ' ' +
            X['location'].fillna(''))

# Split the RAW text first. Fitting TF-IDF before splitting would let the
# vocabulary and IDF weights see the held-out rows (data leakage) and make the
# held-out metrics optimistic. random_state is unchanged, so the same rows land
# in each partition as before.
text_train, text_test, y_train, y_test = train_test_split(
    text_all, y, test_size=0.2, random_state=42)

# Vectorize features: learn vocabulary/IDF on the training portion only, then
# apply that fitted transform to the held-out portion.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Kept for backward compatibility with any code that still reads X_vectorized:
# all rows, encoded with the training-only vocabulary.
X_vectorized = vectorizer.transform(text_all)

In [None]:
# Define XGBoost model (library defaults; the grid below overrides four of them)
model = xgb.XGBClassifier()

# Define parameter grid: 3 values x 4 parameters = 81 candidate settings
param_grid = {
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [4, 6, 8],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Define scoring metrics recorded for every candidate in cv_results_
scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'}

# Create GridSearchCV object with 3-fold stratified cross-validation.
# The scoring dict was previously built but never passed in, so the search
# silently fell back to the estimator's default score (accuracy) only.
# With several metrics, GridSearchCV needs `refit` to name the one that picks
# best_params_ / best_estimator_; F1 is used so that both precision and recall
# of the positive class drive the selection. Change `refit` to 'accuracy' to
# reproduce the old selection criterion.
cv = StratifiedKFold(n_splits=3)
grid_search = GridSearchCV(model, param_grid, scoring=scoring, refit='f1', cv=cv)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)


In [None]:

# Get the best parameters found by the grid search
best_params = grid_search.best_params_

# GridSearchCV (refit enabled) has already retrained the winning configuration
# on all of X_train, so reuse that estimator instead of fitting a second,
# identical XGBClassifier(**best_params) from scratch.
best_model = grid_search.best_estimator_

# Make predictions on the held-out split
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics (precision/recall/F1 are for the positive class)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_rep)

# The submission file is produced by the following cell; the verbatim copy of
# that code which used to live here has been removed so this cell only evaluates.


Accuracy: 0.7826657912015759
Precision: 0.806949806949807
Recall: 0.6440677966101694
F1 Score: 0.7163667523564695
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.89      0.82       874
           1       0.81      0.64      0.72       649

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.77      1523
weighted avg       0.79      0.78      0.78      1523



In [None]:
# Score the unlabelled test file and write the predictions to submission.csv.
test = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Portfolio/NLP/test.csv')

# Build the per-row document the same way as for the training data:
# text + ' ' + keyword + ' ' + location, with missing values turned into ''.
test_filled = test[['text', 'keyword', 'location']].fillna('')
test_vectorized = vectorizer.transform(
    test_filled['text'] + ' ' + test_filled['keyword'] + ' ' + test_filled['location']
)

# Predict with the already-fitted model; no refitting happens here.
y_pred_test = best_model.predict(test_vectorized)

# Two-column frame (id, target), written without the pandas index.
submission = test[['id']].assign(target=y_pred_test)
submission.to_csv('submission.csv', index=False)