In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import joblib
from wordcloud import WordCloud
from tqdm import tqdm

# Read the labeled dataset from disk into a pandas DataFrame
df = pd.read_csv('labeled_data.csv')

# Print the column / dtype / non-null summary as a quick sanity check
df.info()

# Drop, in place, every row that contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

# Function to expand contractions

def expand_contractions(text):
    """Replace the known contractions in *text* with their expanded forms.

    Matching is case-sensitive and the patterns are lowercase, so callers
    are expected to lowercase the text first. Substitutions are applied in
    the order they are listed.
    """
    expansions = {
        r'can\'t': 'cannot',
        r'won\'t': 'will not',
        # Add more patterns as needed
    }
    expanded = text
    for regex, full_form in expansions.items():
        expanded = re.sub(regex, full_form, expanded)
    return expanded

# Function to remove URLs and mentions

def remove_urls_mentions(text):
    """Strip URLs and @-mentions from *text*.

    Every run of non-whitespace characters that begins at ``http``,
    ``www`` or ``@`` is deleted; the surrounding whitespace is left as is.
    """
    url_or_mention = re.compile(r'http\S+|www\S+|@\S+')
    return url_or_mention.sub('', text)

# Function to remove special characters

def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

# Define a function to preprocess a single text

# Cache for the spaCy pipeline so it is loaded at most once per copy of this
# module state instead of once per tweet (loading dominates the runtime).
_NLP_CACHE = {}


def _get_nlp():
    """Return the cached spaCy English pipeline, loading it on first use."""
    if 'nlp' not in _NLP_CACHE:
        # Only lemmas and the stop/punct/space flags are read by the caller,
        # so the NER and parser components stay disabled.
        _NLP_CACHE['nlp'] = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
    return _NLP_CACHE['nlp']


def preprocess_text(text, nlp=None):
    """Clean a single text and return its lemmatized form.

    The text is lowercased, contractions are expanded, URLs/mentions and
    special characters are removed, and the remainder is run through spaCy.
    Stop words, punctuation and whitespace-only tokens are dropped; the
    lemmas of the remaining tokens are joined with single spaces.

    Args:
        text: The raw text to clean.
        nlp: Optional already-loaded spaCy pipeline. When omitted, a cached
            ``en_core_web_sm`` pipeline (NER and parser disabled) is used.

    Returns:
        The cleaned text as a single string.

    NOTE(review): when this function is shipped to joblib worker processes
    from a notebook, each unpickled copy may carry its own cache, so reuse
    across tasks is not guaranteed there -- confirm, or load the pipeline
    once and process texts in bulk with ``nlp.pipe``.
    """
    if nlp is None:
        nlp = _get_nlp()

    # Convert the text to lowercase (the contraction patterns are lowercase)
    text = text.lower()

    # Expand contractions before the apostrophes are stripped below
    text = expand_contractions(text)

    # Remove URLs and mentions
    text = remove_urls_mentions(text)

    # Remove special characters
    text = remove_special_characters(text)

    # Tokenize the text using spaCy
    doc = nlp(text)

    # Lemmatize the tokens. The removals above leave runs of whitespace that
    # spaCy emits as separate tokens; skip them so they do not end up in the
    # joined output.
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    # Join the tokens back into a single string
    return ' '.join(tokens)

# Fan the per-tweet preprocessing out over all available CPU cores with
# joblib; tqdm reports progress as the tweets are dispatched.
run_parallel = joblib.Parallel(n_jobs=-1)
clean_one = joblib.delayed(preprocess_text)
preprocessed_tweets = run_parallel(clean_one(text) for text in tqdm(df['tweet']))

# Store the cleaned text next to the original tweets
df['clean_tweet'] = preprocessed_tweets

# Persist the augmented DataFrame without the index column
df.to_csv('processed_text.csv', index=False)




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


100%|██████████████████████████████████████████████████████████████████████████| 24783/24783 [1:00:06<00:00,  6.87it/s]
