In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [9]:

train_data = pd.read_excel('train.xlsx')
print("Training Data:")
print(train_data.head())

test_data = pd.read_excel('test.xlsx')
print("Test Data:")
print(test_data.head())


Training Data:
   T1  T2  T3  T4  T5  T6  T7  T8  T9  T10  T11  T12  T13  T14  T15  T16  T17  \
0 -70 -61 -66 -53 -51 -63 -82 -57 -76  -78  -66  -66  -61  -59  -73  -75  -63   
1 -77 -74 -71 -76 -65 -63 -66 -52 -55  -75  -72  -75  -74  -61  -64  -63  -53   
2 -53 -38 -55 -66 -62 -62 -65 -70 -62  -52  -56  -53  -66  -68  -72  -60  -68   
3 -72 -62 -59 -65 -65 -65 -78 -82 -83  -59  -84  -60  -64  -83  -69  -72  -95   
4 -67 -69 -65 -63 -59 -53 -70 -72 -71  -60  -61  -57  -54  -76  -61  -66  -71   

   T18 target  
0  -77    B37  
1  -63    B61  
2  -77    A19  
3  -73    A22  
4  -80    A33  
Test Data:
   T1  T2  T3  T4  T5  T6  T7  T8  T9  T10  T11  T12  T13  T14  T15  T16  T17  \
0 -76 -83 -70 -66 -64 -72 -64 -69 -60  -76  -83  -78  -81  -81  -81  -70  -60   
1 -58 -57 -78 -81 -73 -73 -78 -78 -82  -49  -55  -58  -66  -79  -72  -83  -74   
2 -70 -70 -71 -69 -69 -68 -61 -55 -53  -82  -87  -76  -68  -57  -64  -75  -57   
3 -71 -61 -56 -56 -61 -60 -68 -66 -72  -58  -55  -56  -58  -62  -61

In [10]:

# features and target variable
X_train = train_data.drop(columns=['target'])
y_train = train_data['target']

X_test = test_data[X_train.columns]


In [11]:
#preprocessing using pipeline
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['number']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [12]:

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# logistic regression classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train_preprocessed, y_train)

y_train_pred = classifier.predict(X_train_preprocessed)

#  classifier
print("Training Data Classification Report:")
print(classification_report(y_train, y_train_pred))
print("Training Data Accuracy Score:", accuracy_score(y_train, y_train_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Data Classification Report:
              precision    recall  f1-score   support

          A1       0.93      0.91      0.92       215
         A10       0.83      0.80      0.82       204
         A11       0.95      1.00      0.97       212
         A12       1.00      1.00      1.00       203
         A13       1.00      1.00      1.00       219
         A14       0.99      1.00      1.00       418
         A15       0.99      0.95      0.97       413
         A16       1.00      1.00      1.00       210
         A17       0.97      0.93      0.95       204
         A18       0.99      1.00      1.00       189
         A19       0.99      1.00      0.99       208
          A2       1.00      1.00      1.00       204
         A20       0.99      1.00      1.00       205
         A21       0.89      0.93      0.91       411
         A22       1.00      1.00      1.00       210
         A23       1.00      1.00      1.00       202
         A24       1.00      1.00      1.00 

In [14]:

y_test_pred = classifier.predict(X_test_preprocessed)

test_data['Predicted Target'] = y_test_pred

print("Test Data with Predictions:")
print(test_data.head())


Test Data with Predictions:
   T1  T2  T3  T4  T5  T6  T7  T8  T9  T10  T11  T12  T13  T14  T15  T16  T17  \
0 -76 -83 -70 -66 -64 -72 -64 -69 -60  -76  -83  -78  -81  -81  -81  -70  -60   
1 -58 -57 -78 -81 -73 -73 -78 -78 -82  -49  -55  -58  -66  -79  -72  -83  -74   
2 -70 -70 -71 -69 -69 -68 -61 -55 -53  -82  -87  -76  -68  -57  -64  -75  -57   
3 -71 -61 -56 -56 -61 -60 -68 -66 -72  -58  -55  -56  -58  -62  -61  -59  -64   
4 -72 -71 -64 -69 -64 -63 -61 -42 -55  -61  -69  -67  -63  -63  -55  -49  -49   

   T18 Predicted Target  
0  -60              B74  
1  -80              A10  
2  -70              B65  
3  -65              B20  
4  -57              A67  
