In [None]:
# Importing the Libraries
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages this notebook relies on
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Left as the cell's final expression so its return value is still displayed
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Read the tweets CSV, decoding the file as latin-1
tweets_csv = '/content/Tweets.csv'
df = pd.read_csv(tweets_csv, encoding='latin-1')

# Show (number of rows, number of columns)
print(df.shape)

(27481, 4)


In [None]:
# Build the English stop-word lookup set and the WordNet lemmatizer
# (both are read as globals by preprocess_text)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# Preprocessing function
# The patterns and the punctuation table never change, so they are built once
# here instead of on every call made through DataFrame.apply().
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_MENTION_RE = re.compile(r'@\w+')
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one raw tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions, delete every
    character in ``string.punctuation``, delete digit runs, tokenize with
    ``word_tokenize``, drop tokens found in the global ``stop_words`` set,
    and lemmatize what remains with the global ``lemmatizer``.

    Args:
        text: The raw tweet. Any value that is not a ``str`` (for example a
            missing cell) is treated as empty.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Non-string cells cannot be cleaned; map them to an empty string.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    # Punctuation is removed before stop-word filtering, so a contraction
    # loses its apostrophe before it is compared against stop_words.
    text = text.translate(_PUNCT_TABLE)
    text = _DIGITS_RE.sub('', text)

    kept = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return ' '.join(kept)
# Apply preprocessing to every tweet
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Display cleaned data (first five observations).
# The already-computed column is reused instead of re-running preprocess_text,
# and head(5) works for any index and for frames with fewer than five rows.
preview = df.head(5)
for n, (original_text, cleaned_text) in enumerate(
    zip(preview['text'], preview['cleaned_text']), start=1
):
    print(f"Original Text {n}: {original_text}")
    print(f"Cleaned Text {n}: {cleaned_text}\n")

# Save cleaned data (all original columns plus cleaned_text, without the index)
df.to_csv('Clean_data.csv', index=False)

Original Text 1:  I`d have responded, if I were going
Cleaned Text 1: id responded going

Original Text 2:  Sooo SAD I will miss you here in San Diego!!!
Cleaned Text 2: sooo sad miss san diego

Original Text 3: my boss is bullying me...
Cleaned Text 3: bos bullying

Original Text 4:  what interview! leave me alone
Cleaned Text 4: interview leave alone

Original Text 5:  Sons of ****, why couldn`t they put them on the releases we already bought
Cleaned Text 5: son couldnt put release already bought



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the cleaned dataset
df = pd.read_csv("Clean_data.csv")

# Drop rows missing either the text or the label
# (pandas reads empty CSV fields back as NaN by default, so tweets whose
# cleaned text ended up empty are removed here as well)
df = df.dropna(subset=['cleaned_text', 'sentiment'])

# Encode sentiment labels
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
y = df['sentiment'].map(label_map)

# Any label outside label_map becomes NaN; fail here with a clear message
# rather than later inside model.fit()
unmapped = y.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels: {unexpected}")

# Train-test split on the raw text FIRST, so the vectorizer below never sees
# the held-out tweets
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

# TF-IDF Vectorization: learn vocabulary and IDF weights from the training
# split only, then apply that fitted transform to the test split
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Train a Random Forest model (50 trees keeps training time short)
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

# Save the trained model
joblib.dump(model, "sentiment_model.pkl")

# Save the fitted TF-IDF vectorizer; inference must reuse it so new text is
# mapped onto the same feature columns the model was trained on
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Accuracy: 0.7014
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.62      0.67      1589
           1       0.64      0.72      0.68      2150
           2       0.77      0.75      0.76      1744

    accuracy                           0.70      5483
   macro avg       0.71      0.70      0.70      5483
weighted avg       0.71      0.70      0.70      5483

Model and vectorizer saved successfully!


In [None]:
!pip install streamlit -q
!wget -q -O - ipv4.icanhazip.com

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h35.225.209.177


In [None]:
!streamlit run UI.py & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20G[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.225.209.177:8501[0m
[0m
y

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏
[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0Kyour url is: https://hungry-ducks-bet.loca.lt
