In [1]:
import pandas as pd

data = pd.read_csv(r'C:\Users\arman\python\projects\NLP\Data\twitter_training.csv', encoding='ISO-8859-1', header=None)
data.columns=['TweetID','entity','sentiment','message']

print(data.head())
messages=data["message"]
messages = messages.fillna('')
messages = messages.astype(str)


   TweetID       entity sentiment  \
0     2401  Borderlands  Positive   
1     2401  Borderlands  Positive   
2     2401  Borderlands  Positive   
3     2401  Borderlands  Positive   
4     2401  Borderlands  Positive   

                                             message  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [2]:
# print(data.info)
# print(data['sentiment'].value_counts())
# print(messages.isnull().sum())
# print(messages.dtype)
data['message_length'] = messages.apply(len) 
print(data.head())
print(messages.describe())

   TweetID       entity sentiment  \
0     2401  Borderlands  Positive   
1     2401  Borderlands  Positive   
2     2401  Borderlands  Positive   
3     2401  Borderlands  Positive   
4     2401  Borderlands  Positive   

                                             message  message_length  
0  im getting on borderlands and i will murder yo...              53  
1  I am coming to the borders and I will kill you...              51  
2  im getting on borderlands and i will kill you ...              50  
3  im coming on borderlands and i will murder you...              51  
4  im getting on borderlands 2 and i will murder ...              57  
count     74682
unique    69492
top            
freq        686
Name: message, dtype: object


In [3]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_tweet(tweet):
    # Remove URLs, mentions, hashtags, and special characters
    tweet = re.sub(r'http\S+|www\S+|@\S+|#\S+|[^A-Za-z0-9\s]+', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Tokenize and remove stopwords
    words = [word for word in tweet.split() if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Apply the preprocessing function to the tweet column
data['cleaned_tweet'] = messages.apply(preprocess_tweet)

# Check the cleaned tweets
print(data['cleaned_tweet'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0      im getting borderland murder
1                coming border kill
2        im getting borderland kill
3       im coming borderland murder
4    im getting borderland 2 murder
Name: cleaned_tweet, dtype: object


In [4]:
validation_data = pd.read_csv(r'C:\Users\arman\python\projects\NLP\Data\twitter_validation.csv', encoding='ISO-8859-1', header = None)
validation_data.columns= ['TweetID','entity','sentiment','message']

#validation_data.head()




validation_data['cleaned_tweet'] = validation_data['message'].apply(preprocess_tweet)


In [5]:
""" 
this is the part that we build a model and convert the texts into features.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 5000)
X_train = vectorizer.fit_transform(data['cleaned_tweet'])
X_val = vectorizer.transform(validation_data['cleaned_tweet'])
print(f'Training data shape: {X_train.shape}')
print(f'Validation data shape: {X_val.shape}')

Training data shape: (74682, 5000)
Validation data shape: (1000, 5000)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, data['sentiment'])
y_pred = model.predict(X_val)


In [7]:
accuracy = accuracy_score(validation_data['sentiment'], y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')
print(classification_report(validation_data['sentiment'], y_pred, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


Validation Accuracy: 80.00%
              precision    recall  f1-score   support

    Negative       0.78      0.70      0.74       172
     Neutral       0.77      0.88      0.82       266
    Positive       0.86      0.74      0.79       285
  Irrelevant       0.79      0.84      0.82       277

    accuracy                           0.80      1000
   macro avg       0.80      0.79      0.79      1000
weighted avg       0.80      0.80      0.80      1000



In [8]:
from sklearn.model_selection import GridSearchCV

# param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'max_iter': [1000, 2000, 3000]}
# grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='f1_weighted')
# grid.fit(X_train, data['sentiment'])
# print("Best parameters:", grid.best_params_)

# # Use the best estimator for predictions
# best_model = grid.best_estimator_
# y_pred_best = best_model.predict(X_val)

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Define a pipeline that scales the data and then applies Logistic Regression
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),  # with_mean=False because TF-IDF produces sparse data
    ('log_reg', LogisticRegression())
])

# Set up the hyperparameter grid
param_grid = {
    'log_reg__C': [0.01, 0.1, 1, 10, 100],        # Regularization strength
    'log_reg__max_iter': [5000],      # Number of iterations for convergence
    'log_reg__solver': [ 'saga']          # Try alternative solvers
}

# Initialize GridSearchCV with the pipeline, parameter grid, and cross-validation settings
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted')

# Fit the grid search on the training data
grid.fit(X_train, data['sentiment'])

# Print the best parameters found by GridSearchCV
print("Best parameters found: ", grid.best_params_)

# Use the best model from grid search for predictions
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val)

# Evaluate the performance of the best model on validation data
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(validation_data['sentiment'], y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')
print(classification_report(validation_data['sentiment'], y_pred, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


Best parameters found:  {'log_reg__C': 0.01, 'log_reg__max_iter': 5000, 'log_reg__solver': 'saga'}
Validation Accuracy: 85.20%
              precision    recall  f1-score   support

    Negative       0.82      0.87      0.84       172
     Neutral       0.82      0.90      0.86       266
    Positive       0.92      0.79      0.85       285
  Irrelevant       0.85      0.87      0.86       277

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.86      0.85      0.85      1000



In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train the model on the training data
rf_model.fit(X_train, data['sentiment'])

# Make predictions on the validation data
y_pred_rf = rf_model.predict(X_val)

# Evaluate the model
accuracy_rf = accuracy_score(validation_data['sentiment'], y_pred_rf)
print(f'Random Forest Validation Accuracy: {accuracy_rf * 100:.2f}%')
print(classification_report(validation_data['sentiment'], y_pred_rf, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


Random Forest Validation Accuracy: 45.70%
              precision    recall  f1-score   support

    Negative       1.00      0.03      0.06       172
     Neutral       0.37      0.94      0.53       266
    Positive       0.88      0.16      0.27       285
  Irrelevant       0.58      0.57      0.58       277

    accuracy                           0.46      1000
   macro avg       0.71      0.42      0.36      1000
weighted avg       0.68      0.46      0.39      1000



In [13]:
%pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.2


You should consider upgrading via the 'C:\Program Files\Python310\python.exe -m pip install --upgrade pip' command.


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting xgboost
  Using cached xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.2


You should consider upgrading via the 'c:\Program Files\Python310\python.exe -m pip install --upgrade pip' command.


In [19]:
# Define the mapping for sentiment labels
label_mapping = {
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Irrelevant': 3
}

# Apply the mapping to both training and validation datasets
data['sentiment_numeric'] = data['sentiment'].map(label_mapping)
validation_data['sentiment_numeric'] = validation_data['sentiment'].map(label_mapping)


In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost model
xgb_model = XGBClassifier(n_estimators=100, max_depth=10, use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train the model on the training data using numeric labels
xgb_model.fit(X_train, data['sentiment_numeric'])

# Make predictions on the validation data
y_pred_xgb = xgb_model.predict(X_val)

# Evaluate the model
accuracy_xgb = accuracy_score(validation_data['sentiment_numeric'], y_pred_xgb)
print(f'XGBoost Validation Accuracy: {accuracy_xgb * 100:.2f}%')
print(classification_report(validation_data['sentiment_numeric'], y_pred_xgb, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


Parameters: { "use_label_encoder" } are not used.



XGBoost Validation Accuracy: 83.70%
              precision    recall  f1-score   support

    Negative       0.75      0.92      0.83       266
     Neutral       0.90      0.77      0.83       285
    Positive       0.83      0.88      0.85       277
  Irrelevant       0.94      0.76      0.84       172

    accuracy                           0.84      1000
   macro avg       0.86      0.83      0.84      1000
weighted avg       0.85      0.84      0.84      1000



In [23]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Initialize RandomizedSearchCV with a limited number of iterations
random_search = RandomizedSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    param_distributions=param_dist,
    n_iter=5,  # Limits the number of parameter combinations to try
    cv=3,
    scoring='accuracy',
    random_state=42
)

# Fit the model
random_search.fit(X_train, data['sentiment_numeric'])
print("Best parameters:", random_search.best_params_)



Parameters: { "use_label_encoder" } are not used.



KeyboardInterrupt: 

In [24]:
%pip install xgboost --extra-index-url https://pypi.nvidia.com


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


You should consider upgrading via the 'c:\Program Files\Python310\python.exe -m pip install --upgrade pip' command.





In [26]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost model with GPU support
xgb_model = XGBClassifier(
    n_estimators=300,         # Number of trees, you can adjust for experimentation
    max_depth=10,             # Maximum depth of each tree
    learning_rate=0.1,        # Step size
    tree_method='gpu_hist',   # Enables GPU support
    use_label_encoder=False,  # Avoids unnecessary warnings in recent XGBoost versions
    eval_metric='mlogloss',   # Multi-class logarithmic loss for evaluation
    random_state=42
)

# Train the model with early stopping
xgb_model.fit(
    X_train, data['sentiment_numeric'],
    eval_set=[(X_val, validation_data['sentiment_numeric'])],  # Validation data for early stopping
    early_stopping_rounds=10,  # Stops if no improvement after 10 rounds
    verbose=True  # Prints progress and performance metrics during training
)

# Make predictions on the validation set
y_pred_xgb = xgb_model.predict(X_val)

# Evaluate the model
accuracy_xgb = accuracy_score(validation_data['sentiment_numeric'], y_pred_xgb)
print(f'XGBoost Validation Accuracy: {accuracy_xgb * 100:.2f}%')
print(classification_report(validation_data['sentiment_numeric'], y_pred_xgb, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [29]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
dtrain = xgb.DMatrix(X_train, label=data['sentiment_numeric'])
dval = xgb.DMatrix(X_val, label=validation_data['sentiment_numeric'])

# Set up XGBoost parameters
params = {
    'objective': 'multi:softmax',  # Multi-class classification
    'num_class': 4,                # Number of classes in the target variable
    'max_depth': 10,
    'learning_rate': 0.1,
    'tree_method': 'gpu_hist',     # Use GPU for training
    'eval_metric': 'mlogloss'      # Evaluation metric
}

# Train the model with early stopping
bst = xgb.train(
    params, 
    dtrain, 
    num_boost_round=300,           # Maximum number of boosting rounds
    evals=[(dval, 'validation')],  # Validation set for evaluation
    early_stopping_rounds=10,      # Stop if no improvement after 10 rounds
    verbose_eval=True              # Print progress during training
)

# Make predictions on the validation data
y_pred_xgb = bst.predict(dval)

# Evaluate the model
accuracy_xgb = accuracy_score(validation_data['sentiment_numeric'], y_pred_xgb)
print(f'XGBoost Validation Accuracy: {accuracy_xgb * 100:.2f}%')
print(classification_report(validation_data['sentiment_numeric'], y_pred_xgb, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


    E.g. tree_method = "hist", device = "cuda"



XGBoostError: [07:02:11] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:861: Exception in gpu_hist: [07:02:11] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:867: Check failed: ctx_->Ordinal() >= 0 (-1 vs. 0) : Must have at least one device


In [30]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Convert training and validation sets to DMatrix
dtrain = xgb.DMatrix(X_train, label=data['sentiment_numeric'])
dval = xgb.DMatrix(X_val, label=validation_data['sentiment_numeric'])

# Set up XGBoost parameters for CPU training
params = {
    'objective': 'multi:softmax',  # Multi-class classification
    'num_class': 4,                # Number of classes in the target variable
    'max_depth': 10,
    'learning_rate': 0.1,
    'tree_method': 'hist',         # Use CPU for training
    'eval_metric': 'mlogloss'      # Evaluation metric
}

# Train the model with early stopping
bst = xgb.train(
    params, 
    dtrain, 
    num_boost_round=300,           # Maximum number of boosting rounds
    evals=[(dval, 'validation')],  # Validation set for evaluation
    early_stopping_rounds=10,      # Stop if no improvement after 10 rounds
    verbose_eval=True              # Print progress during training
)

# Make predictions on the validation data
y_pred_xgb = bst.predict(dval)

# Evaluate the model
accuracy_xgb = accuracy_score(validation_data['sentiment_numeric'], y_pred_xgb)
print(f'XGBoost Validation Accuracy: {accuracy_xgb * 100:.2f}%')
print(classification_report(validation_data['sentiment_numeric'], y_pred_xgb, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


[0]	validation-mlogloss:1.36165
[1]	validation-mlogloss:1.33937
[2]	validation-mlogloss:1.32007
[3]	validation-mlogloss:1.30362
[4]	validation-mlogloss:1.28841
[5]	validation-mlogloss:1.27422
[6]	validation-mlogloss:1.26240
[7]	validation-mlogloss:1.25083
[8]	validation-mlogloss:1.24018
[9]	validation-mlogloss:1.23025
[10]	validation-mlogloss:1.22223
[11]	validation-mlogloss:1.21148
[12]	validation-mlogloss:1.20220
[13]	validation-mlogloss:1.19444
[14]	validation-mlogloss:1.18688
[15]	validation-mlogloss:1.17877
[16]	validation-mlogloss:1.17029
[17]	validation-mlogloss:1.16369
[18]	validation-mlogloss:1.15744
[19]	validation-mlogloss:1.15101
[20]	validation-mlogloss:1.14490
[21]	validation-mlogloss:1.13904
[22]	validation-mlogloss:1.13128
[23]	validation-mlogloss:1.12506
[24]	validation-mlogloss:1.11966
[25]	validation-mlogloss:1.11433
[26]	validation-mlogloss:1.10841
[27]	validation-mlogloss:1.10264
[28]	validation-mlogloss:1.09825
[29]	validation-mlogloss:1.09294
[30]	validation-mlog