<a href="https://www.kaggle.com/code/lordshandilya/linguisticpatternrecognitiontopredictage-gender?scriptVersionId=170639260" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/blog-authorship-corpus/blogtext.csv


In [2]:
# Read the raw corpus CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/blog-authorship-corpus/blogtext.csv')

In [None]:
# Discard the columns that are not used further down the notebook
df = df.drop(columns=['id', 'topic', 'sign', 'date'])

In [None]:
# Show the first five rows as plain text
print(df.head(n=5))

In [None]:
# Write the trimmed frame to the working directory, without the index column
df.to_csv(path_or_buf='/kaggle/working//blogtext.csv', index=False)

In [5]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from multiprocessing import Pool, cpu_count

# Download NLTK stop words data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet', '/usr/share/nltk_data')

from nltk.corpus import wordnet
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [6]:
def remove_special_char(text):
    """Return ``text`` with every character removed that is not an ASCII
    letter, a digit or whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
# Lazily-filled cache so the stop-word set and the lemmatizer are created once
# per process instead of once per row.
_PREPROCESSING_RESOURCES = {}


def _preprocessing_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESSING_RESOURCES:
        _PREPROCESSING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESSING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (_PREPROCESSING_RESOURCES['stop_words'],
            _PREPROCESSING_RESOURCES['lemmatizer'])


def preprocessing(text):
    """Clean one document and return it as a single space-joined string.

    Steps, in order: strip special characters with ``remove_special_char``,
    tokenize with ``word_tokenize``, lower-case each token, drop tokens found
    in the English stop-word list, and lemmatize the remaining tokens.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    stop_words, lemmatizer = _preprocessing_resources()
    # Lower-case once per token; the same form is used for the stop-word test
    # and for lemmatization.
    lowered = (word.lower() for word in word_tokenize(remove_special_char(text)))
    return ' '.join(
        lemmatizer.lemmatize(word) for word in lowered if word not in stop_words
    )

def process_chunk(chunk):
    """Run ``preprocessing`` over the ``text`` column of one DataFrame chunk.

    The column is overwritten on the chunk that was passed in, and that same
    chunk object is returned.
    """
    cleaned_text = chunk['text'].apply(preprocessing)
    chunk['text'] = cleaned_text
    return chunk

def apply_preprocessing(File):
    """Preprocess the ``text`` column of a CSV file and overwrite the file.

    The file is read in chunks of 1000 rows, each chunk is handed to
    ``process_chunk`` in a pool with one worker per CPU, and the concatenated
    result is written back to the same path without the index column.

    Args:
        File: Path of the CSV file to read and then overwrite.
    """
    # Both context managers release their resources even when a worker raises:
    # the reader closes its file handle and the pool shuts its workers down.
    # pool.map has returned every result before the `with` block exits, and the
    # input file is only rewritten afterwards.
    with pd.read_csv(File, chunksize=1000) as chunks, Pool(cpu_count()) as pool:
        processed_chunks = pool.map(process_chunk, chunks)

    if not processed_chunks:
        # pd.concat raises on an empty list; with nothing read there is
        # nothing to rewrite, so the file is left untouched.
        return

    processed_df = pd.concat(processed_chunks)
    processed_df.to_csv(File, index=False)

In [7]:
# Preprocess the saved CSV; the file at this path is rewritten with the result
apply_preprocessing(File='/kaggle/working//blogtext.csv')

In [None]:
# Read the preprocessed CSV back in and show its first five rows
df = pd.read_csv(filepath_or_buffer='/kaggle/working//blogtext.csv')
print(df.head(n=5))


In [None]:
# Per-column count of missing values
nan_values = df.isna().sum()

# Heading and counts are emitted on separate lines by a single print call
print("The number of NaN in each column is :", nan_values, sep="\n")

In [None]:
# Remove every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch.optim as optim

In [None]:

# Fetch the 'bert-base-uncased' checkpoint: the tokenizer first, then the encoder
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


In [None]:
# Turn off gradient tracking for every BERT parameter
for weight in bert_model.parameters():
    weight.requires_grad = False

In [None]:
# Create the dataset class
class BlogDataset(Dataset):
    """Dataset of blog posts yielding fixed-length BERT inputs plus targets.

    Each item is ``(input_ids, attention_mask, gender, age)``. The first two
    are integer tensors of length ``MAX_LENGTH``; gender and age are scalar
    float tensors.

    Args:
        df: DataFrame with ``text``, ``gender`` and ``age`` columns.

    Raises:
        ValueError: If a string gender label is not a key of
            ``GENDER_TO_LABEL``.
    """

    # Encoding applied when the gender column holds strings; numeric labels are
    # passed through untouched.
    # NOTE(review): assumes the CSV stores gender as 'male'/'female' - confirm.
    # The 0/1 assignment is a convention and must match at inference time.
    GENDER_TO_LABEL = {'male': 0.0, 'female': 1.0}

    # Every sequence is truncated or padded to exactly this many tokens.
    MAX_LENGTH = 128

    def __init__(self, df):
        self.texts = df['text'].tolist()
        self.genders = [self._encode_gender(value) for value in df['gender'].tolist()]
        self.ages = df['age'].tolist()

    @classmethod
    def _encode_gender(cls, value):
        """Map a string gender label to its float code; return other values as-is."""
        if not isinstance(value, str):
            return value
        key = value.strip().lower()
        if key not in cls.GENDER_TO_LABEL:
            raise ValueError(f"Unknown gender label: {value!r}")
        return cls.GENDER_TO_LABEL[key]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        gender = self.genders[idx]
        age = self.ages[idx]

        # Pad to a fixed length so the default DataLoader collate can stack the
        # items of a batch; variable-length tensors cannot be stacked. The
        # tokenizer's own attention mask marks the padding positions with 0.
        encoded = bert_tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.MAX_LENGTH,
            truncation=True,
            padding='max_length',
        )
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']

        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(gender).float(), torch.tensor(age).float()


In [None]:
# Create the model
class GenderAgeModel(nn.Module):
    """Two-headed network on top of the module-level ``bert_model``.

    ``forward`` returns ``(gender_output, age_output)``: a sigmoid-activated
    single-unit gender head and a three-layer age head with a single linear
    output unit. Both heads read the same dropout-regularised encoder output.
    """

    def __init__(self):
        super().__init__()
        # Shared encoder
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        # Gender head: 768 -> 1
        self.gender_output = nn.Linear(768, 1)
        # Age head: 768 -> 128 -> 64 -> 1
        self.age_output_1 = nn.Linear(768, 128)
        self.age_output_2 = nn.Linear(128, 64)
        self.age_output_3 = nn.Linear(64, 1)
        # Activations
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        # Element [1] of the encoder output is used as the sequence-level feature
        # (presumably BERT's pooled output - confirm against the installed
        # transformers version).
        features = self.dropout(self.bert(input_ids, attention_mask)[1])

        gender_prob = self.sigmoid(self.gender_output(features))

        hidden = self.relu(self.age_output_1(features))
        hidden = self.relu(self.age_output_2(hidden))
        age_pred = self.age_output_3(hidden)

        return gender_prob, age_pred

In [None]:
# Load the data
# Re-reading the CSV brings back any rows with missing values, regardless of an
# earlier dropna() on a previous frame. Drop them here so that no NaN text or
# label reaches the tokenizer or the loss.
df = pd.read_csv('/kaggle/working/blogtext.csv').dropna()
dataset = BlogDataset(df)


In [None]:
# Wrap the dataset in a loader that serves shuffled mini-batches
batch_size = 32
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)


In [None]:
# Build the network, then one loss per task and the Adam optimiser that updates it
model = GenderAgeModel()
gender_criterion, age_criterion = nn.BCELoss(), nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)


In [None]:
# Training loop
# The epoch count is named because the progress message below refers to it.
num_epochs = 16
for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids, attention_mask, gender_labels, age_labels = batch

        # Forward pass
        gender_output, age_output = model(input_ids, attention_mask)

        # Compute the losses
        # Both heads end in a single-unit linear layer, so their outputs are
        # (batch, 1) while the labels are (batch,). Drop the trailing axis so
        # the shapes agree: BCELoss rejects the mismatch and MSELoss would
        # broadcast it to (batch, batch).
        gender_loss = gender_criterion(gender_output.squeeze(-1), gender_labels)
        age_loss = age_criterion(age_output.squeeze(-1), age_labels)

        # Backpropagation and optimization
        optimizer.zero_grad()
        (gender_loss + age_loss).backward()
        optimizer.step()

    # Print the losses of the last batch of the epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Gender Loss: {gender_loss.item()}, Age Loss: {age_loss.item()}")