In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load train and test datasets
train_data = pd.read_excel('train.xlsx')
test_data = pd.read_excel('test.xlsx')



In [3]:
train_data.head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,target
0,-70,-61,-66,-53,-51,-63,-82,-57,-76,-78,-66,-66,-61,-59,-73,-75,-63,-77,B37
1,-77,-74,-71,-76,-65,-63,-66,-52,-55,-75,-72,-75,-74,-61,-64,-63,-53,-63,B61
2,-53,-38,-55,-66,-62,-62,-65,-70,-62,-52,-56,-53,-66,-68,-72,-60,-68,-77,A19
3,-72,-62,-59,-65,-65,-65,-78,-82,-83,-59,-84,-60,-64,-83,-69,-72,-95,-73,A22
4,-67,-69,-65,-63,-59,-53,-70,-72,-71,-60,-61,-57,-54,-76,-61,-66,-71,-80,A33


In [4]:
# Identify features and target variable in train dataset
X_train = train_data.drop(columns=['target'])
y_train = train_data['target']


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.2, random_state=42)


In [6]:
# Initialize classifiers
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
lr_classifier = LogisticRegression(max_iter=1000)
svc_classifier = SVC()

In [7]:
# Train classifiers
rf_classifier.fit(X_train, y_train)


In [8]:
from sklearn.preprocessing import StandardScaler

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_classifier.fit(X_train_scaled, y_train)


In [9]:
svc_classifier.fit(X_train, y_train)


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate classifiers
classifiers = {'Random Forest': rf_classifier, 'Logistic Regression': lr_classifier, 'Support Vector Classifier': svc_classifier}
results = {}

for name, classifier in classifiers.items():
    try:
        if name == 'Random Forest':
            classifier.fit(X_train, y_train)  # Ensure RandomForestClassifier is fitted
            y_pred = classifier.predict(X_test_scaled)
        elif name == 'Logistic Regression':
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test_scaled)  # Use scaled data for LR
        else:
            classifier.fit(X_train, y_train)  # Fit SVC classifier
            y_pred = classifier.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')  # Choose an appropriate average parameter
        recall = recall_score(y_test, y_pred, average='weighted')  # Choose an appropriate average parameter
        f1 = f1_score(y_test, y_pred, average='weighted')  # Choose an appropriate average parameter
        results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}
    
    except Exception as e:
        print(f"An error occurred while evaluating classifier {name}: {e}")


  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Compare results
results_df = pd.DataFrame(results).transpose()
print(results_df)

                           Accuracy  Precision    Recall  F1 Score
Random Forest              0.005441   0.000030  0.005441  0.000059
Logistic Regression        0.502789   0.546350  0.502789  0.456723
Support Vector Classifier  0.982451   0.983425  0.982451  0.982420


In [12]:
# Select the best classifier based on your criteria (e.g., highest accuracy)
best_classifier_name = results_df['Accuracy'].idxmax()
best_classifier = classifiers[best_classifier_name]

In [13]:
# Train the best classifier on the entire dataset
best_classifier.fit(X_train, y_train)

SVC achieves the highest accuracy, precision, recall, and F1 score among the evaluated classifiers, indicating superior overall performance on the test dataset.

In [14]:
# Make predictions using the trained SVC classifier
predictions = best_classifier.predict(X_test)
predictions

array(['B62', 'B55', 'A52', ..., 'A2', 'B2', 'A70'], dtype=object)

In [15]:
# Share target values for the test data
# Assuming X_test is a DataFrame and y_pred is the predicted target values
# Share target values for the test data
test_target_values = pd.DataFrame({'Predicted_Target': y_pred})
test_target_values.to_excel('predicted_target_values.xlsx', index=False)



In [16]:
# Calculate and print train accuracy
train_accuracy = accuracy_score(y_train, best_classifier.predict(X_train))
print("Train Accuracy:", train_accuracy)

Train Accuracy: 0.9894221284990307
