In [2]:
import pandas as pd

twitter_data = pd.read_csv(r"D:\My Projects 1\Dataanalytics (begining)\oasisProjectfile\Twitter_Data.csv")

# Display the first few rows of the dataframe to understand its structure
twitter_data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


# preprocessing step

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Convert to lower case and remove punctuation
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join the tokens back into a single string
    return ' '.join(tokens)

# Apply preprocessing to the text column
twitter_data['processed_text'] = twitter_data['clean_text'].apply(preprocess_text)

# Display the first few rows to verify preprocessing
twitter_data[['clean_text', 'processed_text', 'category']].head()

# following steps manually for text preprocessing:

In [6]:
# Tokenization using basic Python string methods.
# Removal of stopwords using a predefined list.
# Lowercasing and removal of punctuation.
# Lemmatization using a simple lemmatizer.

In [8]:
# Predefined stopwords list (a small sample for demonstration)
stop_words = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 
    'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 
    'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 
    'don', 'should', 'now'
}

# Simple lemmatizer dictionary (a small sample for demonstration)
lemmatizer_dict = {
    'am': 'be', 'is': 'be', 'are': 'be', 'was': 'be', 'were': 'be', 'has': 'have', 'had': 'have', 
    'does': 'do', 'did': 'do', 'doing': 'do', 'goes': 'go', 'went': 'go', 'gone': 'go', 'going': 'go',
    'see': 'see', 'saw': 'see', 'seen': 'see', 'seeing': 'see', 'buy': 'buy', 'bought': 'buy', 
    'buying': 'buy', 'get': 'get', 'got': 'get', 'gotten': 'get', 'getting': 'get', 'make': 'make', 
    'made': 'make', 'making': 'make'
}

def simple_lemmatizer(word):
    return lemmatizer_dict.get(word, word)

def preprocess_text_simple(text):
    # Remove punctuation and lowercase the text
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    # Tokenize the text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [simple_lemmatizer(word) for word in tokens]
    # Join the tokens back into a single string
    return ' '.join(tokens)

# Apply preprocessing to the text column
twitter_data['processed_text'] = twitter_data['clean_text'].apply(preprocess_text_simple)

# Display the first few rows to verify preprocessing
twitter_data[['clean_text', 'processed_text', 'category']].head()


AttributeError: 'float' object has no attribute 'translate'

# It seems there are some missing values or non-string entries in the clean_text column. Let's handle these by converting all entries to strings and replacing any missing values with an empty string before preprocessing.

In [None]:
# Let's preprocess the text data again with these adjustments

In [10]:
# Convert all entries to strings and handle missing values
twitter_data['clean_text'] = twitter_data['clean_text'].fillna('').astype(str)

def preprocess_text_simple(text):
    # Remove punctuation and lowercase the text
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    # Tokenize the text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    tokens = [simple_lemmatizer(word) for word in tokens]
    # Join the tokens back into a single string
    return ' '.join(tokens)

# Apply preprocessing to the text column
twitter_data['processed_text'] = twitter_data['clean_text'].apply(preprocess_text_simple)

# Display the first few rows to verify preprocessing
twitter_data[['clean_text', 'processed_text', 'category']].head()

Unnamed: 0,clean_text,processed_text,category
0,when modi promised “minimum government maximum...,modi promised “minimum government maximum gove...,-1.0
1,talk all the nonsense and continue all the dra...,talk nonsense continue drama vote modi,0.0
2,what did just say vote for modi welcome bjp t...,say vote modi welcome bjp told rahul main camp...,1.0
3,asking his supporters prefix chowkidar their n...,asking supporters prefix chowkidar names modi ...,1.0
4,answer who among these the most powerful world...,answer among powerful world leader today trump...,1.0
