In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abise\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv('My_final_sentiment.csv')
print(df.head())  


       File         AdverseReaction Severity
0  ADCETRIS   Peripheral Neuropathy   Severe
1  ADCETRIS             Anaphylaxis   Severe
2  ADCETRIS      Infusion Reactions   Severe
3  ADCETRIS  Hematologic Toxicities   Severe
4  ADCETRIS              Infections   Severe


In [9]:
# Step 1: Preprocess the text (remove stopwords, lowercase, etc.)
stop_words = set(stopwords.words('english'))
df['AdverseReaction'] = df['AdverseReaction'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words]))

# Step 2: Encode the labels (Severity) into numerical values
label_encoder = LabelEncoder()
df['Severity_Label'] = label_encoder.fit_transform(df['Severity'])

In [10]:
# Step 3: Split the data into training and test sets
X = df['AdverseReaction']
y = df['Severity_Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
# Step 5: Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 6: Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

# Step 7: Predict the severity of new reactions
new_reaction = ["Severe headache", "Mild rash"]
new_reaction_tfidf = vectorizer.transform(new_reaction)
predictions = model.predict(new_reaction_tfidf)

# Convert numeric labels back to original severity labels
predicted_severity = label_encoder.inverse_transform(predictions)
print(predicted_severity)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.62      0.37      0.47     50766
           1       0.62      0.25      0.36     25910
           2       0.63      0.88      0.73     88296

    accuracy                           0.63    164972
   macro avg       0.62      0.50      0.52    164972
weighted avg       0.62      0.63      0.59    164972

['Severe' 'Severe']


In [8]:
df_cleaned = df.drop_duplicates(subset=['File', 'AdverseReaction'])

In [13]:
#with both file and advereser reaction 

In [14]:

# Step 2: Combine 'File' and 'AdverseReaction' into a single text column
df_cleaned['Combined_Feature'] = df_cleaned['File'] + " " + df_cleaned['AdverseReaction']

# Step 3: Preprocess the text (remove stopwords, lowercase, etc.)
stop_words = set(stopwords.words('english'))
df_cleaned['Combined_Feature'] = df_cleaned['Combined_Feature'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Combined_Feature'] = df_cleaned['File'] + " " + df_cleaned['AdverseReaction']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Combined_Feature'] = df_cleaned['Combined_Feature'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words]))


In [15]:
# Step 4: Encode the labels (Severity) into numerical values
label_encoder = LabelEncoder()
df_cleaned['Severity_Label'] = label_encoder.fit_transform(df_cleaned['Severity'])

# Step 5: Split the data into training and test sets
X = df_cleaned['Combined_Feature']
y = df_cleaned['Severity_Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Vectorize the combined text using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Severity_Label'] = label_encoder.fit_transform(df_cleaned['Severity'])


In [16]:
# Step 7: Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 8: Evaluate the model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

# Step 9: Predict the severity of new reactions
new_reaction = ["ADCETRIS Peripheral Neuropathy", "ADCETRIS Mild rash"]
new_reaction_tfidf = vectorizer.transform(new_reaction)
predictions = model.predict(new_reaction_tfidf)

# Convert numeric labels back to original severity labels
predicted_severity = label_encoder.inverse_transform(predictions)
print(predicted_severity)

              precision    recall  f1-score   support

           0       0.83      0.79      0.81     11156
           1       0.83      0.77      0.80      7131
           2       0.86      0.90      0.88     21881

    accuracy                           0.85     40168
   macro avg       0.84      0.82      0.83     40168
weighted avg       0.85      0.85      0.85     40168

['None' 'None']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
print(df_cleaned['Severity'].value_counts())

Severe      109449
Moderate     55154
None         36237
Name: Severity, dtype: int64


In [18]:
!pip install imbalanced-learn




In [19]:
#usiong smote in cure the issue of imbalance 


In [20]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming X_train_tfidf and y_train are already defined

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)

# Train the model on the resampled data
model = LogisticRegression(max_iter=1000, class_weight='balanced')  # You can add class_weight if needed
model.fit(X_train_res, y_train_res)

# Evaluate the model on the test set
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.86      0.81     11156
           1       0.74      0.89      0.80      7131
           2       0.93      0.81      0.87     21881

    accuracy                           0.84     40168
   macro avg       0.81      0.85      0.83     40168
weighted avg       0.85      0.84      0.84     40168



In [22]:
#using balanced method

In [23]:
from sklearn.linear_model import LogisticRegression

# Use class weighting to handle the imbalance
model = LogisticRegression(class_weight='balanced')

# Fit the model on the training data
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.87      0.81     11156
           1       0.72      0.90      0.80      7131
           2       0.94      0.80      0.86     21881

    accuracy                           0.83     40168
   macro avg       0.81      0.85      0.82     40168
weighted avg       0.85      0.83      0.84     40168



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
#ensemble with xgboost and random forest


In [26]:
!pip install xgboost




In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a Random Forest model with class weighting
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=100, max_depth=10)
rf_model.fit(X_train_tfidf, y_train)

# Evaluate the model on the test set
y_pred_rf = rf_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       0.73      0.31      0.43     11156
           1       0.57      0.38      0.46      7131
           2       0.63      0.89      0.74     21881

    accuracy                           0.64     40168
   macro avg       0.64      0.53      0.54     40168
weighted avg       0.65      0.64      0.60     40168



In [28]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Train an XGBoost model with class weighting
xgb_model = xgb.XGBClassifier(scale_pos_weight=3, random_state=42, n_estimators=100, max_depth=10)
xgb_model.fit(X_train_tfidf, y_train)

# Evaluate the model on the test set
y_pred_xgb = xgb_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_xgb))


Parameters: { "scale_pos_weight" } are not used.



              precision    recall  f1-score   support

           0       0.96      0.78      0.86     11156
           1       0.97      0.76      0.85      7131
           2       0.85      0.99      0.91     21881

    accuracy                           0.89     40168
   macro avg       0.93      0.84      0.88     40168
weighted avg       0.90      0.89      0.89     40168



In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [6, 8, 10],
    'n_estimators': [100, 200],
    'scale_pos_weight': [1, 2, 3]
}

# Create a XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Fitting 3 folds for each of 54 candidates, totalling 162 fits
