In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [None]:
# Load the data into a DataFrame.
#
# The file's encoding is not known in advance, so candidate encodings are
# tried in order, strictest first:
#   - 'utf-8' and 'cp1252' both reject some byte sequences, so a failure
#     there is informative and the next candidate is tried;
#   - 'latin-1' maps every possible byte to a character and therefore never
#     raises UnicodeDecodeError, so it has to come LAST as the catch-all.
# ('iso-8859-1' is just another name for 'latin-1', so it is not listed twice.)
encodings = ['utf-8', 'cp1252', 'latin-1']

for encoding in encodings:
    try:
        df = pd.read_csv('Path Of Your Data File', encoding=encoding)
    except UnicodeDecodeError:
        print(f"Failed to read with encoding {encoding}")
    else:
        # Report which encoding was used so the result can be sanity-checked
        print(f"Successfully read with encoding {encoding}")
        break
else:
    # Only reachable if the catch-all encoding is removed from the list.
    # Fail loudly instead of continuing with an undefined (or stale) `df`.
    raise ValueError(f"Could not decode the data file with any of: {encodings}")

# Now df contains your DataFrame with the successfully decoded data

# Display a few values from the original data
print("Original Data Sample:")
print(df.head())


In [None]:
import pandas as pd
import re

# Specify the column containing text data
message_column = 'Column Name Contain Text Data'

# Clean the text column in one vectorized pass:
#   1. fillna('')  - missing messages become empty strings. This must happen
#                    BEFORE the cast: astype(str) would otherwise turn NaN into
#                    the literal text "nan", which downstream text features
#                    would treat as a real word.
#   2. astype(str) - make sure every remaining element is a string.
#   3. str.lower() - lowercase everything.
#   4. str.replace - drop every character that is not a letter, digit or
#                    whitespace.
df[message_column] = (
    df[message_column]
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)


In [None]:
# Show the first rows again so the effect of the cleaning step is visible
print("\nData After Preprocessing:", df.head(), sep="\n")


In [None]:
# Column that holds the sentiment labels
sentiment_column = 'Column Name Contain Sentiment'  # Replace with the actual column name

# Fit a LabelEncoder on the sentiment labels and map them to integer codes (y)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df[sentiment_column])

# Preview the first five encoded labels
print("\nEncoded Target Variable (y):", y_encoded[:5], sep="\n")


In [None]:
# Build the TF-IDF vectorizer (adjust max_features as needed)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit the vectorizer on the cleaned messages and encode them in one step
messages = df[message_column]
X = tfidf_vectorizer.fit_transform(messages)

# Preview the first five rows of the feature matrix
print("\nTF-IDF Features (X):", X[:5], sep="\n")


In [None]:
# Split the data into training and testing sets.
#
# The RAW text is split here (not a TF-IDF matrix computed on the whole
# dataset) so that the vectorizer can be fitted on the training documents
# only. Fitting TF-IDF on all rows before splitting leaks information from the
# test set (its vocabulary and document frequencies) into the features, which
# makes the evaluation optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df[message_column], y_encoded, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training text only, then apply
# that fitted transformation unchanged to the held-out test text.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Display a few values of X_train
print("\nFew Values Of X_train :")
print(X_train[:5])
# Display a few values of X_test
print("\nFew Values Of X_test :")
print(X_test[:5])
# Display a few values of y_train
print("\nFew Values Of y_train :")
print(y_train[:5])
# Display a few values of y_test
print("\nFew Values Of y_test:")
print(y_test[:5])


In [None]:
# Logistic regression settings: iteration cap and a fixed seed
lr_params = {"max_iter": 1000, "random_state": 42}

# Build the classifier and fit it on the training split
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report.
# y_test / y_pred hold the integer codes produced by label_encoder; decode
# them back to the original sentiment labels first so each row of the report
# is named after the real class instead of an opaque 0/1/2.
report = classification_report(
    label_encoder.inverse_transform(y_test),
    label_encoder.inverse_transform(y_pred),
)
print("\nClassification Report:")
print(report)
