In [1]:
# Importing the Libraries
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Initialize Lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [3]:
# Load the dataset
df = pd.read_csv('/content/tweets.csv', encoding='latin-1')

In [4]:
# Display dataset shape and first five rows
print(df.shape)
print(df.head())

(2399, 5)
   id keyword        location  \
0   0  ablaze             NaN   
1   1  ablaze             NaN   
2   2  ablaze   New York City   
3   3  ablaze  Morgantown, WV   
4   4  ablaze             NaN   

                                                text  target  
0  Communal violence in Bhainsa, Telangana. "Ston...       1  
1  Telangana: Section 144 has been imposed in Bha...       1  
2  Arsonist sets cars ablaze at dealership https:...       1  
3  Arsonist sets cars ablaze at dealership https:...       1  
4  "Lord Jesus, your love brings freedom and pard...       0  


In [5]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    return ' '.join(tokens)

In [6]:
# Apply preprocessing
df['cleaned_text'] = df['text'].apply(preprocess_text)

In [7]:
# Display cleaned data (first five observations)
for i in range(5):
    original_text = df.loc[i, 'text']
    cleaned_text = preprocess_text(original_text)
    print(f"Original Text {i+1}: {original_text}")
    print(f"Cleaned Text {i+1}: {cleaned_text}\n")

Original Text 1: Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze
Cleaned Text 1: communal violence bhainsa telangana stone pelted muslim house house vehicle set ablaze

Original Text 2: Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po
Cleaned Text 2: telangana section imposed bhainsa january clash erupted two group january po

Original Text 3: Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI
Cleaned Text 3: arsonist set car ablaze dealership

Original Text 4: Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9
Cleaned Text 4: arsonist set car ablaze dealership

Original Text 5: "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l https://t.co/VlTznnPNi8
Cleaned Text 5: lord jesus love brings freedom pardon fill hol

In [8]:
# Save cleaned data
df.to_csv('Clean_data.csv', index=False)

## **Model Training**

In [9]:
# Import necessary libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec

In [10]:
# Load the cleaned dataset
df = pd.read_csv('Clean_data.csv')

In [11]:
# Handle missing values in 'cleaned_text' column (replace with empty string)
df['cleaned_text'] = df['cleaned_text'].fillna('')

In [12]:
# Step 1: Convert text data into numerical representations (TF-IDF)
tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(df['cleaned_text'])

In [13]:
# Step 2: Explore and integrate word embedding techniques (Word2Vec)
# Tokenize the text data for Word2Vec
tokenized_text = df['cleaned_text'].apply(lambda x: x.split())

In [14]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)
word_vectors = w2v_model.wv

In [15]:
# Function to get average Word2Vec embeddings for a document
def get_avg_word2vec(tokens, model, vector_size):
    embeddings = [model[word] for word in tokens if word in model]
    if len(embeddings) == 0:
        return np.zeros(vector_size)
    return np.mean(embeddings, axis=0)

In [16]:
# Get Word2Vec embeddings for all documents
X_w2v = np.array([get_avg_word2vec(tokens, word_vectors, 100) for tokens in tokenized_text])

In [17]:
# Combine TF-IDF and Word2Vec features
X_combined = np.hstack((X_tfidf.toarray(), X_w2v))

In [18]:
# Step 3: Select appropriate machine learning algorithm (Random Forest)
y = df['target']

In [19]:
# Step 4: Implement model training pipeline including data splitting and hyperparameter tuning
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [20]:
# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

In [21]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
grid_search = GridSearchCV(rf_model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [22]:
# Get the best model
best_rf_model = grid_search.best_estimator_

In [23]:
# Train the best model on the entire training set
best_rf_model.fit(X_train, y_train)

In [24]:
# Step 5: Develop mechanisms for model evaluation using relevant metrics
y_pred = best_rf_model.predict(X_test)

In [25]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Random Forest Algorithm:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Random Forest Algorithm:
Accuracy: 0.8708333333333333
Precision: 0.8885508849557522
Recall: 0.8708333333333333
F1 Score: 0.8416552598735859

Classification Report:
               precision    recall  f1-score   support

           0       0.86      1.00      0.93       390
           1       1.00      0.31      0.47        90

    accuracy                           0.87       480
   macro avg       0.93      0.66      0.70       480
weighted avg       0.89      0.87      0.84       480



In [26]:
# Save the trained model and vectorizer
import joblib
joblib.dump(best_rf_model, 'best_rf_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(w2v_model, 'w2v_model.pkl')

['w2v_model.pkl']

## User Interface

In [27]:
!pip install streamlit -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/8.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m4.8/8.7 MB[0m [31m144.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m8.6/8.7 MB[0m [31m148.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m8.6/8.7 MB[0m [31m148.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/207.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━

In [28]:
!wget -q -O - ipv4.icanhazip.com

34.150.198.116


In [None]:
!streamlit run UI.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.150.198.116:8501[0m
[0m
[1G[0JNeed to install the following packages:
  localtunnel@2.0.2
Ok to proceed? (y) [20G
([100;90m⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂⠂[0m) ⠙ idealTree:75ac80b86e83d4a2: [7msill[0m [35midealTree[0m buildDeps[0m[K
([107;97m#########[0m[100;90m⠂⠂⠂⠂⠂⠂⠂⠂⠂[0m) ⠦ idealTree:75ac80b86e83d4a2: [32;40mtiming[0m [35midealTree:#root[0m Compl[0m[K
[K[?25hyour url is: https://lazy-days-mate.loca.lt
2024-08-01 09:42:16.407 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
2024-08-01 09:42:16.409 `label