#### Text Lowercasing:

Convert all text to lowercase to ensure uniformity. This prevents the model from treating "HATE" and "hate" as different words.

#### Removing Special Characters and Punctuation:

Remove punctuation marks, special characters, and symbols, as they don't typically provide meaningful information for hate speech detection.

#### Removing URLs and User Mentions:

Remove URLs and user mentions (e.g., @username) from the text, as they are not relevant to the analysis.

#### Removing Numbers:

Remove numerical digits or replace them with a placeholder token if numbers don't convey important information.

#### Tokenization:

Tokenize the text into words or subword units. Tokenization breaks the text into smaller units, making it easier for the NLP model to process.

#### Removing Stop Words:

Depending on the specific analysis, you may choose to remove common stop words (e.g., "the," "and," "in") to reduce noise. However, be cautious when removing stop words, as they can be relevant in some hate speech contexts.

#### Stemming or Lemmatization (Optional):

Apply stemming or lemmatization to reduce words to their base forms. This can help normalize the text and reduce feature dimensionality. Be mindful of the potential loss of context when applying these techniques.

#### Handling Emojis and Emoticons:

Decide whether to keep or remove emojis and emoticons. Some hate speech may include offensive symbols or characters, so consider their importance to the analysis.
#### Handling Abbreviations and Acronyms:

Expand or normalize common abbreviations and acronyms to their full forms (e.g., "lol" to "laugh out loud").

#### Spelling Correction (Optional):

Depending on the quality of the data, you may choose to apply spelling correction to address typos and misspellings.

#### Removing or Masking Sensitive Information (Optional):

If the data contains sensitive or personally identifiable information (PII), consider removing or masking it to protect privacy and comply with data regulations.

#### Filtering Non-Textual Content (Optional):

Depending on your analysis, you may need to filter out non-textual content such as images or videos associated with tweets.
#### Handling Imbalanced Data (if applicable):

In [None]:
import os
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn


from spellchecker import SpellChecker
from spellchecker import WordFrequency

from collections import Counter
from textblob import TextBlob

from transformers import AutoTokenizer
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from transformers import XLNetForSequenceClassification, XLNetTokenizer

from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Set the maximum width for column display to a large value (e.g., 200 characters)
pd.set_option('display.max_colwidth', 200)





In [None]:
# Switch the working directory to the project folder, printing the current
# directory before and after so the change can be confirmed.
# NOTE(review): the path is machine-specific; presumably it is where the
# tweet CSVs live — adjust it when running on another machine.
print(os.getcwd())
os.chdir('C://Users/andre/Job Portfolio Projects/DataGlacierVI/sentiment.analysis/')
print(os.getcwd())

In [None]:
# Load the raw tweet CSVs from the working directory:
# dftt holds the test tweets, dftrt the training tweets.
dftt = pd.read_csv('test_tweets.csv')
dftrt = pd.read_csv('train_tweets.csv')

In [None]:
# Quick look at the test set: first rows, column summary, and (rows, columns).
display(dftt.head())
# info() prints its summary itself and returns None, so it is called directly
# rather than wrapped in display(), which would render a stray "None".
dftt.info()
display(dftt.shape)

In [None]:
# Quick look at the training set: first rows, column summary, and (rows, columns).
display(dftrt.head())
# info() prints its summary itself and returns None, so it is called directly
# rather than wrapped in display(), which would render a stray "None".
dftrt.info()
display(dftrt.shape)

In [None]:
# Share of all rows (test + train) that belong to the test set.
# The result is a fraction between 0 and 1, not a percentage.
dftt.shape[0]/(dftt.shape[0] + dftrt.shape[0])

In [None]:
# Average tweet length

def avg_len_tweet(colname):
    """Print the mean character count of the entries in ``dftrt[colname]``.

    Reads the module-level ``dftrt`` DataFrame. The mean is printed with
    ``%d`` formatting, so it is shown as a whole number. Nothing is returned.
    """
    # Character count of every entry in the column
    tweet_lengths = [len(entry) for entry in dftrt[colname]]

    # Mean = total characters / number of entries
    mean_length = sum(tweet_lengths) / len(tweet_lengths)
    print('Average tweet length: %d characters' % mean_length)
    

## Data Cleaning

In [None]:
# Report the average length of the raw (uncleaned) 'tweet' column of dftrt.
avg_len_tweet('tweet')

In [None]:

def clean_tweet(tweet):
    """Normalize a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase the text, delete URLs (``http`` up to the next
    whitespace), delete ``@username`` mentions, delete every character that is
    not an ASCII letter or whitespace, then collapse whitespace runs into
    single spaces and trim both ends.
    """
    # Applied in this order; every match is deleted outright (not replaced
    # by a space), so text on either side of a match is joined together.
    removal_patterns = (
        r'http\S+',      # URLs
        r'@\w+',         # user mentions like '@username'
        r'[^a-zA-Z\s]',  # punctuation and special characters (keeps whitespace)
        r'[0-9]',        # digits (none remain after the previous pattern)
    )

    cleaned = tweet.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # split() with no arguments also discards leading/trailing whitespace
    return ' '.join(cleaned.split())


In [None]:
# Add a 'clean_tweet' column to the training data by running clean_tweet over
# every raw tweet (strips URLs, user mentions, punctuation, special
# characters, numbers, and extra spaces), then preview the first rows.

dftrt['clean_tweet'] = dftrt['tweet'].apply(clean_tweet)
dftrt.head()

In [None]:
# Apply the same cleaning to the test data and preview the first rows.
dftt['clean_tweet'] = dftt['tweet'].apply(clean_tweet)
dftt.head()

## Model

In [None]:
import torch
import pandas as pd
import torch.nn as nn
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
from transformers import XLNetForSequenceClassification, XLNetTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


# Load and preprocess training data
def preprocess_data(csv_file, tokenizer, max_len=110):
    """Load a tweet CSV and encode its cleaned text as fixed-length model inputs.

    Parameters
    ----------
    csv_file : str
        Path of a CSV with a ``tweet`` column. The file named exactly
        ``'train_tweets.csv'`` must also provide a ``label`` column.
    tokenizer : type
        Tokenizer class exposing ``from_pretrained`` (e.g. ``XLNetTokenizer``).
        It is instantiated here with the ``"xlnet-base-cased"`` vocabulary.
    max_len : int, optional
        Length every sequence is padded or truncated to. Defaults to 110.

    Returns
    -------
    tuple of torch.Tensor
        ``(input_ids, attention_masks, labels)`` when ``csv_file`` is
        ``'train_tweets.csv'``, otherwise ``(input_ids, attention_masks)``.
        Each returned tensor has one row per CSV row, in file order.
    """
    # Load the CSV and clean every tweet
    df = pd.read_csv(csv_file)
    df['clean_tweet'] = df['tweet'].apply(clean_tweet)

    # Create an instance of the tokenizer
    tokenizer = tokenizer.from_pretrained("xlnet-base-cased")

    # Encode the whole column in one call and let the tokenizer handle
    # truncation and padding. This keeps the real vocabulary ids, keeps the
    # special tokens when a tweet is truncated, and pads with the tokenizer's
    # own pad token on the side it is configured for. Every row is encoded
    # (none are skipped), so the inputs stay aligned with the labels.
    texts = df['clean_tweet'].astype(str).tolist()
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    # 1 for real tokens, 0 for padding
    attention_masks = encoded['attention_mask']

    if csv_file == 'train_tweets.csv':
        # Only the training file carries target labels
        labels = torch.tensor(df['label'].values, dtype=torch.long)
        return input_ids, attention_masks, labels

    return input_ids, attention_masks
    
    

# Load and preprocess both datasets. The training file yields labels as a
# third tensor; the test file yields only input ids and attention masks.
input_ids_train, attention_masks_train, y_train = preprocess_data('train_tweets.csv', XLNetTokenizer)
input_ids_test, attention_masks_test = preprocess_data('test_tweets.csv', XLNetTokenizer)



# Number of tweets per batch (used for both loaders below)
batch_size = 110

# Create DataLoader for training data (shuffled)
train_data = TensorDataset(input_ids_train, attention_masks_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Create DataLoader for test data (no labels, original order kept)
test_data = TensorDataset(input_ids_test, attention_masks_test)
test_loader = DataLoader(test_data, batch_size=batch_size)



# Define the XLNet-based model with a two-class classification head
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)  

# Define optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)



# Fine-tune the model on the training batches (runs on the CPU)
num_epochs = 3
device = torch.device("cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move every tensor of the batch to the target device in one step
        input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)

        # Standard step: reset gradients, forward pass with labels so the
        # model returns the loss, backpropagate, update the weights
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean loss over the batches seen in this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


# Predict a class for every test tweet; gradients are not needed here
model.eval()
y_pred = []
with torch.no_grad():
    for batch in test_loader:
        # Test batches carry no labels: only input ids and attention masks
        input_ids, attention_masks = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        predictions = logits.argmax(dim=1)
        y_pred.extend(predictions.cpu().numpy())


# model.eval()
# y_test = []  # Replace this with your test labels
# y_pred = []
# with torch.no_grad():
#     for batch in test_loader:
#         input_ids, attention_masks, labels = batch
#         input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)
#         outputs = model(input_ids, attention_mask=attention_masks)
#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=1)
#         y_pred.extend(predictions.cpu().numpy())
#         y_test.extend(labels.cpu().numpy())  # Replace this with your test labels

# # Calculate accuracy and print classification report
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)
# print(f"Accuracy: {accuracy:.4f}")
# print(report)



In [None]:
# Load the pretrained "xlnet-base-cased" tokenizer through AutoTokenizer and
# echo the object so the notebook displays it.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
tokenizer

In [None]:
# Two hand-written sample tweets used to try out the tokenizer
tweets = [
    "This is a sample tweet.",
    "Another tweet to tokenize."
]

# Encode each sample (special tokens included); one encoded result per tweet
tokenized_tweets = [tokenizer.encode(tweet, add_special_tokens=True) for tweet in tweets]


In [None]:
# Distinct values present in the training set's 'label' column
dftrt['label'].unique()

In [None]:
# Training rows whose label equals 1
dftrt[dftrt['label'] == 1]

In [None]:
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

In [None]:
# Twitter API credentials. The values are placeholders: substitute real keys
# before running, and avoid committing real secrets with the notebook.
consumer_key = 'XXXXXXXXXXXXX'
consumer_secret = 'XXXXXXXXXXXXX'
access_token = 'XXXXXXXXXXXXXX'
access_token_secret = 'XXXXXXXXXXXX'

In [None]:
# Authenticate against the Twitter API and build the client used to fetch
# tweets. This runs at notebook top level, so the handler and client are
# plain variables (`auth`, `api`) rather than attributes of `self`.
try:
    # Create OAuth object
    auth = OAuthHandler(consumer_key, consumer_secret)
    # Set access_token and secret
    auth.set_access_token(access_token, access_token_secret)
    # Create tweepy API object to fetch tweets
    api = tweepy.API(auth)
except Exception as exc:
    print('Error: Authentication Failed...')
    # Show the underlying cause instead of hiding it
    print(exc)

In [None]:
# Fetch tweets matching the query and print the text of each result.
tweets = api.search('anything_you_want_to_search')
# Iterate over the search results that were just fetched
for tweet in tweets:
    print(tweet.text)

In [None]:
# Wrap a tweet's text in a TextBlob for sentiment analysis.
# NOTE(review): `tweet` is whatever the preceding loop left bound, so
# presumably only the last fetched tweet is analysed — confirm this is intended.
analysis = TextBlob(tweet.text)

In [None]:
# Print the sentiment TextBlob reports for the analysed text
print(analysis.sentiment)