In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import joblib
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stopwords
from textblob import TextBlob
import re

# Load the dataset
df = pd.read_csv('IADC_CODE.csv')

# Check for missing values in the dataframe
print("Missing values in each column before handling:")
print(df.isnull().sum())

# Data Preprocessing
# Fill missing values in 'comments' with a placeholder
df['comments'].fillna('missing', inplace=True)

# Ensure the 'comments' column is of string type
df['comments'] = df['comments'].astype(str)

# Handling missing values in target column 'Sub code'
df['Sub code'].fillna('UNKNOWN', inplace=True)

# Verify no more missing values in 'comments' or 'Sub code'
print("Missing values in each column after handling:")
print(df.isnull().sum())

# Clean and preprocess text data
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in sklearn_stopwords]
    return ' '.join(words)

df['comments'] = df['comments'].apply(preprocess_text)

# Remove classes with fewer than 2 instances
class_counts = df['Sub code'].value_counts()
classes_to_keep = class_counts[class_counts >= 2].index
df = df[df['Sub code'].isin(classes_to_keep)]

# Verify the distribution of classes after removal
print("Class distribution after removing rare classes:")
print(df['Sub code'].value_counts())

# Split the dataset into features (X) and target (y)
X = df['comments']
y = df['Sub code']

# Check for any unexpected data types or values
print(X.head())
print(y.head())
print(X.dtype)
print(y.dtype)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature Engineering and Model Selection in a Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', RandomForestClassifier())
])

# Define parameter grid for GridSearchCV
param_grid = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_split': [2, 5],
    'clf__min_samples_leaf': [1, 2]
}

# Use GridSearchCV with error_score='raise' to debug
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2, error_score='raise')
try:
    grid_search.fit(X_train, y_train)
except Exception as e:
    print(f"An error occurred: {e}")

# Check if grid search was successful
if hasattr(grid_search, 'best_estimator_'):
    # Model Evaluation
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    # Save the best model and vectorizer for future use
    joblib.dump(grid_search.best_estimator_, 'iadc_code_model.pkl')
    print("Model saved successfully.")
else:
    print("Grid search failed to find a suitable model.")


Missing values in each column before handling:
Borehole           1
Start              1
End                1
Time Type          2
Phase           1010
Code               2
Sub code           2
Start depth       75
End depth         75
Hole size          6
section name       6
comments           2
dtype: int64
Missing values in each column after handling:
Borehole           1
Start              1
End                1
Time Type          2
Phase           1010
Code               2
Sub code           0
Start depth       75
End depth         75
Hole size          6
section name       6
comments           0
dtype: int64
Class distribution after removing rare classes:
Sub code
6A         357
5A         122
11A         93
2A          62
21L         61
22B         54
14A         43
13A         31
12B         21
12A         19
1A          19
15A         17
12C         16
21A         16
8A          14
8B           8
22D          7
21B          7
23B          6
9A           6
3A           4
22A  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['comments'].fillna('missing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sub code'].fillna('UNKNOWN', inplace=True)


Accuracy: 0.7164179104477612
Classification Report:
              precision    recall  f1-score   support

         11A       1.00      0.42      0.59        19
         12A       0.67      0.50      0.57         4
         12B       0.50      0.25      0.33         4
         12C       0.00      0.00      0.00         3
         13A       0.60      0.50      0.55         6
         14A       0.75      0.67      0.71         9
         15A       1.00      0.33      0.50         3
         15B       0.00      0.00      0.00         1
          1A       1.00      1.00      1.00         4
         21A       0.75      1.00      0.86         3
         21B       1.00      1.00      1.00         1
         21E       0.00      0.00      0.00         1
         21L       1.00      0.92      0.96        12
         22A       0.00      0.00      0.00         1
         22B       0.60      0.27      0.38        11
         22D       0.00      0.00      0.00         1
         22J       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import joblib
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stopwords
import re

# Load the dataset
df = pd.read_csv('IADC_CODE.csv')

# Check for missing values in the dataframe
print("Missing values in each column before handling:")
print(df.isnull().sum())

# Data Preprocessing
# Fill missing values in 'comments' with a placeholder
df['comments'] = df['comments'].fillna('missing')

# Ensure the 'comments' column is of string type
df['comments'] = df['comments'].astype(str)

# Handling missing values in target column 'Sub code'
df['Sub code'] = df['Sub code'].fillna('UNKNOWN')

# Verify no more missing values in 'comments' or 'Sub code'
print("Missing values in each column after handling:")
print(df.isnull().sum())

# Clean and preprocess text data
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in sklearn_stopwords]
    return ' '.join(words)

df['comments'] = df['comments'].apply(preprocess_text)

# Remove classes with fewer than 2 instances
class_counts = df['Sub code'].value_counts()
classes_to_keep = class_counts[class_counts >= 2].index
df = df[df['Sub code'].isin(classes_to_keep)]

# Verify the distribution of classes after removal
print("Class distribution after removing rare classes:")
print(df['Sub code'].value_counts())

# Split the dataset into features (X) and target (y)
X = df['comments']
y = df['Sub code']

# Check for any unexpected data types or values
print(X.head())
print(y.head())
print(X.dtype)
print(y.dtype)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature Engineering and Model Selection in a Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', GradientBoostingClassifier())
])

# Define parameter grid for GridSearchCV
param_grid = {
    'tfidf__max_df': [0.5, 0.75],
    'tfidf__min_df': [1, 2],
    'clf__n_estimators': [100, 200],
    'clf__learning_rate': [0.1, 0.01],
    'clf__max_depth': [3, 5]
}

# Use StratifiedKFold for better cross-validation
cv = StratifiedKFold(n_splits=3)

# Use GridSearchCV with error_score='raise' to debug
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1, verbose=2, error_score='raise')

try:
    grid_search.fit(X_train, y_train)
except Exception as e:
    print(f"An error occurred: {e}")

# Check if grid search was successful
if hasattr(grid_search, 'best_estimator_'):
    # Model Evaluation
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    # Save the best model and vectorizer for future use
    joblib.dump(grid_search.best_estimator_, 'iadc_code_model.pkl')
    print("Model saved successfully.")
else:
    print("Grid search failed to find a suitable model.")


Missing values in each column before handling:
Borehole           1
Start              1
End                1
Time Type          2
Phase           1010
Code               2
Sub code           2
Start depth       75
End depth         75
Hole size          6
section name       6
comments           2
dtype: int64
Missing values in each column after handling:
Borehole           1
Start              1
End                1
Time Type          2
Phase           1010
Code               2
Sub code           0
Start depth       75
End depth         75
Hole size          6
section name       6
comments           0
dtype: int64
Class distribution after removing rare classes:
Sub code
6A         357
5A         122
11A         93
2A          62
21L         61
22B         54
14A         43
13A         31
12B         21
12A         19
1A          19
15A         17
12C         16
21A         16
8A          14
8B           8
22D          7
21B          7
23B          6
9A           6
3A           4
22A  



Accuracy: 0.7164179104477612
Classification Report:
              precision    recall  f1-score   support

         11A       0.92      0.58      0.71        19
         12A       0.50      0.50      0.50         4
         12B       0.67      0.50      0.57         4
         12C       0.00      0.00      0.00         3
         13A       0.75      0.50      0.60         6
         14A       0.80      0.44      0.57         9
         15A       1.00      0.33      0.50         3
         15B       0.00      0.00      0.00         1
          1A       1.00      0.75      0.86         4
         21A       1.00      1.00      1.00         3
         21B       1.00      1.00      1.00         1
         21E       0.00      0.00      0.00         1
         21L       1.00      0.92      0.96        12
         22A       0.00      0.00      0.00         1
         22B       0.67      0.36      0.47        11
         22D       0.00      0.00      0.00         1
         22J       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (853780538.py, line 12)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with the correct encoding
file_path = r'C:\Users\KIIT\Desktop\AI\DPR.csv'  # Use raw string to avoid unicode escape issues
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows of the dataset to understand its structure
print(data.head())

# Print the column names to verify them
print("Column names in the dataset:", data.columns)

# Column names based on the provided structure
comments_column = 'comments'
iadc_code_column = 'Sub code'

# Check if the columns exist in the dataset
if comments_column not in data.columns or iadc_code_column not in data.columns:
    raise KeyError(f"One or both columns '{comments_column}' and '{iadc_code_column}' do not exist in the dataset")

# Preprocess the data
# Drop rows with missing values in the columns of interest
data = data.dropna(subset=[comments_column, iadc_code_column])

# Extract the comments and IADC codes
comments = data[comments_column].astype(str).values
iadc_codes = data[iadc_code_column].astype(str).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(comments, iadc_codes, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Define the model
model = LogisticRegression()

# Define the grid of hyperparameters
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l2', 'none'],  # Regularization type
    'max_iter': [100, 200, 300, 500]  # Maximum number of iterations
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and the best score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Cross-Validation Accuracy: {grid_search.best_score_}')

# Use the best model to predict on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Test Set Accuracy: {accuracy}')
print('Classification Report:')
print(report)


  data = pd.read_csv(file_path, encoding='ISO-8859-1')


  Borehole                 Start                  End Time Type  Phase Code   \
0  SB_14P-5  24-09-2023 16:00:00  24-09-2023 18:00:00        PT    NaN  RDRP   
1  SB_14P-5  24-09-2023 18:00:00  24-09-2023 20:30:00        PT    NaN  DAUP   
2  SB_14P-5  24-09-2023 20:30:00  25-09-2023 02:00:00        PT    NaN  DAUP   
3  SB_14P-5  25-09-2023 02:00:00  25-09-2023 04:00:00        PT    NaN  DAUP   
4  SB_14P-5  25-09-2023 04:00:00  25-09-2023 06:00:00        PT    NaN  DAUP   

  Sub code  Start depth  End depth  Hole size  ... Unnamed: 16374  \
0       1D       3170.0     3170.0        6.0  ...            NaN   
1      14A       3170.0     3170.0        6.0  ...            NaN   
2      14A       3170.0     3170.0        6.0  ...            NaN   
3      14A       3170.0     3170.0        6.0  ...            NaN   
4      11A       3170.0     3170.0        6.0  ...            NaN   

  Unnamed: 16375  Unnamed: 16376  Unnamed: 16377  Unnamed: 16378  \
0            NaN             NaN    

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\KIIT\Desktop\AI\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\KIIT\Desktop\AI\.venv\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\KIIT\Desktop\AI\.venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\KIIT\Desktop\AI\.venv\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
 

Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2'}
Best Cross-Validation Accuracy: 0.7963470319634702
Test Set Accuracy: 0.8102189781021898
Classification Report:
              precision    recall  f1-score   support

         11A       0.90      0.70      0.79        27
         12A       1.00      0.91      0.95        11
         12B       0.50      0.67      0.57         3
         12C       1.00      0.56      0.71         9
         13A       0.83      1.00      0.91         5
         14A       0.67      0.86      0.75        14
         15A       0.67      0.40      0.50         5
         15B       0.00      0.00      0.00         4
         17A       0.00      0.00      0.00         1
          1A       0.67      1.00      0.80         2
          1B       1.00      1.00      1.00         2
          1C       0.00      0.00      0.00         1
          1D       0.83      0.83      0.83         6
         21A       1.00      0.50      0.67         4
         21B   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
