In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with the correct encoding
file_path = r'C:\Users\arnav\OneDrive\Desktop\ONGC\DPR.csv'  # Use raw string to avoid unicode escape issues
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows of the dataset to understand its structure
print(data.head())

# Print the column names to verify them
print("Column names in the dataset:", data.columns)

# Column names based on the provided structure
comments_column = 'comments'
iadc_code_column = 'Sub code'

# Check if the columns exist in the dataset
if comments_column not in data.columns or iadc_code_column not in data.columns:
    raise KeyError(f"One or both columns '{comments_column}' and '{iadc_code_column}' do not exist in the dataset")

# Preprocess the data
# Drop rows with missing values in the columns of interest
data = data.dropna(subset=[comments_column, iadc_code_column])

# Extract the comments and IADC codes
comments = data[comments_column].astype(str).values
iadc_codes = data[iadc_code_column].astype(str).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(comments, iadc_codes, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Define the model
model = LogisticRegression()

# Define the grid of hyperparameters
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l2', 'none'],  # Regularization type
    'max_iter': [100, 200, 300, 500]  # Maximum number of iterations
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and the best score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Cross-Validation Accuracy: {grid_search.best_score_}')

# Use the best model to predict on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Test Set Accuracy: {accuracy}')
print('Classification Report:')
print(report)


  data = pd.read_csv(file_path, encoding='ISO-8859-1')


  Borehole                 Start                  End Time Type  Phase Code   \
0  SB_14P-5  24-09-2023 16:00:00  24-09-2023 18:00:00        PT    NaN  RDRP   
1  SB_14P-5  24-09-2023 18:00:00  24-09-2023 20:30:00        PT    NaN  DAUP   
2  SB_14P-5  24-09-2023 20:30:00  25-09-2023 02:00:00        PT    NaN  DAUP   
3  SB_14P-5  25-09-2023 02:00:00  25-09-2023 04:00:00        PT    NaN  DAUP   
4  SB_14P-5  25-09-2023 04:00:00  25-09-2023 06:00:00        PT    NaN  DAUP   

  Sub code  Start depth  End depth  Hole size  ... Unnamed: 16374  \
0       1D       3170.0     3170.0        6.0  ...            NaN   
1      14A       3170.0     3170.0        6.0  ...            NaN   
2      14A       3170.0     3170.0        6.0  ...            NaN   
3      14A       3170.0     3170.0        6.0  ...            NaN   
4      11A       3170.0     3170.0        6.0  ...            NaN   

  Unnamed: 16375  Unnamed: 16376  Unnamed: 16377  Unnamed: 16378  \
0            NaN             NaN    

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\arnav\OneDrive\Desktop\ONGC\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\arnav\OneDrive\Desktop\ONGC\.venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\arnav\OneDrive\Desktop\ONGC\.venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\arnav\OneDrive\Desktop\ONGC\.venv\Lib\site-packages\sklearn\utils\_param_validation.p

Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2'}
Best Cross-Validation Accuracy: 0.7963470319634702
Test Set Accuracy: 0.8102189781021898
Classification Report:
              precision    recall  f1-score   support

         11A       0.90      0.70      0.79        27
         12A       1.00      0.91      0.95        11
         12B       0.50      0.67      0.57         3
         12C       1.00      0.56      0.71         9
         13A       0.83      1.00      0.91         5
         14A       0.67      0.86      0.75        14
         15A       0.67      0.40      0.50         5
         15B       0.00      0.00      0.00         4
         17A       0.00      0.00      0.00         1
          1A       0.67      1.00      0.80         2
          1B       1.00      1.00      1.00         2
          1C       0.00      0.00      0.00         1
          1D       0.83      0.83      0.83         6
         21A       1.00      0.50      0.67         4
         21B   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
