In [3]:
import pandas as pd
import re
import nltk

# Download the NLTK resources used by the preprocessing cell:
#   punkt / punkt_tab -> word_tokenize models
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# (3.9 and later) load the tokenizer tables from 'punkt_tab'; with only 'punkt'
# installed, word_tokenize raises LookupError there. nltk.download reports a
# failure instead of raising, so the extra request is harmless on older setups.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob


[nltk_data] Downloading package punkt to C:\Users\Bisol
[nltk_data]     Mathai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Bisol
[nltk_data]     Mathai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Bisol
[nltk_data]     Mathai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:

# Read the raw tweets CSV into the working DataFrame.
# A more tolerant variant is kept here for reference (not executed); it skips
# malformed lines, forces UTF-8 and uses the python parsing engine:
#   pd.read_csv("tweets_50000.csv",
#               on_bad_lines='skip',
#               encoding='utf-8',
#               engine='python')
df = pd.read_csv(filepath_or_buffer="tweets_50000.csv")
# Text preprocessing helpers

# Cache for the NLTK objects shared by every preprocess_text call; filled on
# first use so that defining the function never touches the NLTK corpora.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached (stop_words, stemmer, lemmatizer) triple, building it once."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS["stop_words"] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS["stemmer"] = PorterStemmer()
        _TEXT_TOOLS["lemmatizer"] = WordNetLemmatizer()
    return (
        _TEXT_TOOLS["stop_words"],
        _TEXT_TOOLS["stemmer"],
        _TEXT_TOOLS["lemmatizer"],
    )


def preprocess_text(text, spell_correct=True):
    """Normalize a single tweet into a space-joined string of cleaned tokens.

    Steps, in order: lowercase; strip URLs, @mentions, #hashtags, digits and
    punctuation; tokenize; drop English stop words; Porter-stem; WordNet
    lemmatize; optionally spell-correct the joined result with TextBlob.

    Parameters
    ----------
    text : str or scalar
        Raw tweet text. Missing values (None / NaN) yield "". Any other
        non-string scalar is converted with ``str()`` before cleaning instead
        of failing on ``.lower()``.
    spell_correct : bool, default True
        Run ``TextBlob(...).correct()`` on the result. This is the slow step of
        the pipeline; pass False to skip it on large datasets.

    Returns
    -------
    str
        The cleaned text, or "" when the input is missing or nothing is left
        after cleaning.
    """
    if not isinstance(text, str):
        if pd.isna(text):  # Handle None / NaN
            return ""
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

    # Remove mentions (@user) and hashtags (#hashtag)
    text = re.sub(r"@\w+|#\w+", "", text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation and special characters (underscores survive: \w matches them)
    text = re.sub(r'[^\w\s]', '', text)

    # Nothing left to tokenize: same result as running the full pipeline on
    # whitespace, without the tokenizer/stemmer overhead.
    if not text.strip():
        return ""

    stop_words, stemmer, lemmatizer = _get_text_tools()

    # Tokenize, drop stop words, then stem and lemmatize each remaining token.
    # NOTE(review): lemmatizing and spell-correcting already-stemmed tokens may
    # not behave as intended (stems are often not dictionary words) - confirm
    # the ordering is deliberate before relying on cleaned_text downstream.
    words = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    text = " ".join(words)

    # Spell correction (optional, slow)
    if spell_correct and text.strip():
        text = str(TextBlob(text).correct())

    return text

# Apply preprocessing to the 'text' column.
# Only non-missing entries are stringified: a blanket astype(str) turns NaN
# into the literal string "nan", which slips past the pd.isna guard inside
# preprocess_text and gets cleaned as if it were a real word. Keeping the
# missing entries as NaN lets that guard map them to "".
df["cleaned_text"] = (
    df["text"]
    .astype(str)
    .where(df["text"].notna())
    .apply(preprocess_text)
)

# Save the cleaned dataset
df.to_csv("Cleaned_Book1.csv", index=False)

# Display the first 5 rows of the cleaned data
print(df[["text", "cleaned_text"]].head())


KeyboardInterrupt: 

In [None]:
import numpy as np 

# Reload the cleaned tweets written by the preprocessing step
df = pd.read_csv("Cleaned_Book1.csv")  # Replace with your actual dataset file


def _token_count(value):
    """Count whitespace-separated tokens in value; null values count as 0."""
    if pd.notnull(value):
        return len(value.split())
    return 0


def _pattern_hits(value, pattern):
    """Count the non-overlapping matches of pattern in str(value)."""
    return sum(1 for _ in re.finditer(pattern, str(value)))


# Feature 2: Follower-Friend Ratio
# Zero friend counts are swapped for NaN first, so the ratio is NaN for those rows.
friends_nonzero = df['user_friends'].replace({0: np.nan})
df['follower_friend_ratio'] = df['user_followers'] / friends_nonzero

# Feature 3: Tweet Length (disabled)
# NOTE(review): read_csv loads empty cells as NaN by default, so len() would
# fail on rows whose cleaned_text was saved as "" - probably needs a
# fillna("") before this can be re-enabled; confirm.
#df['tweet_length'] = df['cleaned_text'].apply(len)

# Feature 4: Hashtag Count
# Counts whitespace-separated tokens of the 'hashtags' cell.
# NOTE(review): assumes hashtags are whitespace-delimited in the CSV - verify.
df['hashtag_count'] = df['hashtags'].apply(_token_count)

# Feature 5: Mention Count (taken from the raw text, not the cleaned text)
df['mention_count'] = df['text'].apply(_pattern_hits, args=(r'@\w+',))

# Feature 6: URL Count (http:// or https:// links in the raw text)
df['url_count'] = df['text'].apply(_pattern_hits, args=(r'http[s]?://\S+',))

# Persist the frame with the new feature columns
df.to_csv("processed_dataset.csv", index=False)

print("Feature extraction completed. Processed dataset saved as 'processed_dataset.csv'.")