In [1]:
import pandas as pd

train_df = pd.read_csv('train_utf8.csv')
test_df = pd.read_csv('test_utf8.csv')

In [2]:
# Drop rows with missing 'crimeaditionalinfo'
train_df = train_df.dropna(subset=['crimeaditionalinfo'])

# Fill missing 'sub_category' with a placeholder (e.g., 'Unknown')
train_df['sub_category'] = train_df['sub_category'].fillna('Unknown')

# Verify the changes
print("Updated Missing Values:")
print(train_df.isnull().sum())


Updated Missing Values:
category              0
sub_category          0
crimeaditionalinfo    0
dtype: int64


### Spell Checking

In [7]:
from symspellpy import SymSpell, Verbosity
from tqdm import tqdm
import pandas as pd

# Initialize SymSpell
max_edit_distance_dictionary = 2  # Maximum edit distance for lookups
prefix_length = 7  # Length of prefixes used for dictionary entries
sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

# Load a dictionary file (pre-built dictionary for faster processing)
# Download frequency_dictionary_en_82_765.txt from https://github.com/mammothb/symspellpy
dictionary_path = "frequency_dictionary_en_82_765.txt"
term_index = 0  # Column of the term in the dictionary
count_index = 1  # Column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index, count_index)

# Function for spelling correction using SymSpell
def correct_spelling(text):
    if isinstance(text, str):
        suggestions = sym_spell.lookup_compound(text, max_edit_distance_dictionary)
        if suggestions:
            return suggestions[0].term  # Return the best suggestion
    return text  # Return as-is for non-string inputs

# Use tqdm to show progress
tqdm.pandas()

# Example DataFrames (replace with your actual data)
# train_df = ...
# test_df = ...

# Apply spelling correction with progress bar
train_df['corrected_text'] = train_df['crimeaditionalinfo'].progress_apply(correct_spelling)
test_df['corrected_text'] = test_df['crimeaditionalinfo'].progress_apply(correct_spelling)

# Check corrected text
print("Corrected Sample Complaints:")
print(train_df[['crimeaditionalinfo', 'corrected_text']].head())


2024-11-21 19:05:35,383: E symspellpy.symspellpy] Dictionary file not found at frequency_dictionary_en_82_765.txt.
100%|███████████████████████████████████████████████████████████████████████████| 93665/93665 [05:29<00:00, 284.66it/s]
100%|███████████████████████████████████████████████████████████████████████████| 31229/31229 [01:49<00:00, 285.48it/s]

Corrected Sample Complaints:
                                  crimeaditionalinfo  \
0  I had continue received random calls and abusi...   
1  The above fraudster is continuously messaging ...   
2  He is acting like a police and demanding for m...   
3  In apna Job I have applied for job interview f...   
4  I received a call from lady stating that she w...   

                                      corrected_text  
0  i had continue received random calls and abusi...  
1  the above fraudster is continuously messaging ...  
2  he is acting like a police and demanding for m...  
3  in apna job i have applied for job interview f...  
4  i received a call from lady stating that she w...  





### Tokenization

In [8]:
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Download tokenizer if needed
nltk.download('punkt')

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text) if isinstance(text, str) else []

# Use tqdm to show progress
tqdm.pandas()

# Apply tokenization with progress bar
train_df['tokens'] = train_df['corrected_text'].progress_apply(tokenize_text)
test_df['tokens'] = test_df['corrected_text'].progress_apply(tokenize_text)

# Check tokenized complaints
print("Tokenized Sample Complaints:")
print(train_df[['corrected_text', 'tokens']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████████████████████████████████████████████████████████████████████| 93665/93665 [00:54<00:00, 1721.46it/s]
100%|██████████████████████████████████████████████████████████████████████████| 31229/31229 [00:18<00:00, 1674.09it/s]

Tokenized Sample Complaints:
                                      corrected_text  \
0  i had continue received random calls and abusi...   
1  the above fraudster is continuously messaging ...   
2  he is acting like a police and demanding for m...   
3  in apna job i have applied for job interview f...   
4  i received a call from lady stating that she w...   

                                              tokens  
0  [i, had, continue, received, random, calls, an...  
1  [the, above, fraudster, is, continuously, mess...  
2  [he, is, acting, like, a, police, and, demandi...  
3  [in, apna, job, i, have, applied, for, job, in...  
4  [i, received, a, call, from, lady, stating, th...  





### Lemmatization

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Download tokenizer if needed
nltk.download('punkt')

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text) if isinstance(text, str) else []

# Use tqdm to show progress
tqdm.pandas()

# Apply tokenization with progress bar
train_df['tokens'] = train_df['corrected_text'].progress_apply(tokenize_text)
test_df['tokens'] = test_df['corrected_text'].progress_apply(tokenize_text)

# Check tokenized complaints
print("Tokenized Sample Complaints:")
print(train_df[['corrected_text', 'tokens']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████████████████████████████████████████████████████████████████████| 93665/93665 [00:52<00:00, 1794.47it/s]
100%|██████████████████████████████████████████████████████████████████████████| 31229/31229 [00:17<00:00, 1828.89it/s]


Tokenized Sample Complaints:
                                      corrected_text  \
0  i had continue received random calls and abusi...   
1  the above fraudster is continuously messaging ...   
2  he is acting like a police and demanding for m...   
3  in apna job i have applied for job interview f...   
4  i received a call from lady stating that she w...   

                                              tokens  
0  [i, had, continue, received, random, calls, an...  
1  [the, above, fraudster, is, continuously, mess...  
2  [he, is, acting, like, a, police, and, demandi...  
3  [in, apna, job, i, have, applied, for, job, in...  
4  [i, received, a, call, from, lady, stating, th...  


### POS Tagging

In [22]:
import nltk
nltk.download('averaged_perceptron_tagger')
from tqdm import tqdm
# Function for POS tagging
def pos_tag_tokens(tokens):
    return nltk.pos_tag(tokens)
tqdm.pandas()
# Apply POS tagging to create 'pos_tags' column
train_df['pos_tags'] = train_df['tokens'].apply(pos_tag_tokens)
test_df['pos_tags'] = test_df['tokens'].apply(pos_tag_tokens)

# Check POS tagging results
print("POS-Tagged Sample Complaints:")
print(train_df[['tokens', 'pos_tags']].head())


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


POS-Tagged Sample Complaints:
                                              tokens  \
0  [i, had, continue, received, random, calls, an...   
1  [the, above, fraudster, is, continuously, mess...   
2  [he, is, acting, like, a, police, and, demandi...   
3  [in, apna, job, i, have, applied, for, job, in...   
4  [i, received, a, call, from, lady, stating, th...   

                                            pos_tags  
0  [(i, NN), (had, VBD), (continue, VBN), (receiv...  
1  [(the, DT), (above, JJ), (fraudster, NN), (is,...  
2  [(he, PRP), (is, VBZ), (acting, VBG), (like, I...  
3  [(in, IN), (apna, JJ), (job, NN), (i, NNS), (h...  
4  [(i, NN), (received, VBD), (a, DT), (call, NN)...  


### Stop Word Removal Based on Tagging

In [25]:
# Define POS tags to keep
important_pos_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}

# Function to filter tokens based on POS tags
def filter_by_pos(pos_tags):
    return [word for word, tag in pos_tags if tag in important_pos_tags]
tqdm.pandas()
# Apply the filtering
train_df['filtered_tokens'] = train_df['pos_tags'].apply(filter_by_pos)
test_df['filtered_tokens'] = test_df['pos_tags'].apply(filter_by_pos)

# Check filtered tokens
print("Filtered Tokens Based on POS Tags (Sample):")
print(train_df[['pos_tags', 'filtered_tokens']].head())


Filtered Tokens Based on POS Tags (Sample):
                                            pos_tags  \
0  [(i, NN), (had, VBD), (continue, VBN), (receiv...   
1  [(the, DT), (above, JJ), (fraudster, NN), (is,...   
2  [(he, PRP), (is, VBZ), (acting, VBG), (like, I...   
3  [(in, IN), (apna, JJ), (job, NN), (i, NNS), (h...   
4  [(i, NN), (received, VBD), (a, DT), (call, NN)...   

                                     filtered_tokens  
0  [i, had, continue, received, random, calls, ab...  
1  [above, fraudster, is, continuously, messaging...  
2  [is, acting, police, demanding, money, adding,...  
3  [apna, job, i, have, applied, job, interview, ...  
4  [i, received, call, lady, stating, send, new, ...  


### Domain-Based Entity Tagging

In [26]:
from tqdm import tqdm
from joblib import Parallel, delayed

# Use tqdm to show progress
tqdm.pandas()

# Domain Entities Dictionary
domain_entities = {
    'Business Email CompromiseEmail Takeover': ['email', 'takeover', 'business'],
    'Cheating by Impersonation': ['cheating', 'impersonation', 'fraud'],
    'Cryptocurrency Fraud': ['cryptocurrency', 'bitcoin', 'crypto', 'fraud'],
    'Cyber Bullying  Stalking  Sexting': ['bullying', 'stalking', 'sexting', 'harassment'],
    'Cyber Terrorism': ['terrorism', 'cyber', 'extremism'],
    'Damage to computer computer systems etc': ['damage', 'computer', 'systems'],
    'Data Breach/Theft': ['data', 'breach', 'theft', 'hacking'],
    'DebitCredit Card FraudSim Swap Fraud': ['card', 'debit', 'credit', 'fraud', 'sim'],
    'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': ['ddos', 'dos', 'denial'],
    'EWallet Related Fraud': ['ewallet', 'fraud', 'wallet', 'money'],
    'Email Hacking': ['email', 'hacking', 'phishing'],
    'FakeImpersonating Profile': ['fake', 'impersonating', 'profile'],
    'Fraud CallVishing': ['fraud', 'call', 'vishing'],
    'Hacking/Defacement': ['hacking', 'defacement'],
    'Online Gambling  Betting': ['gambling', 'betting', 'online'],
    'Online Job Fraud': ['job', 'fraud', 'scam'],
    'Online Matrimonial Fraud': ['matrimonial', 'marriage', 'fraud'],
    'Ransomware': ['ransomware', 'attack', 'encryption'],
    'UPI Related Frauds': ['upi', 'fraud', 'transaction', 'money'],
    'Website DefacementHacking': ['website', 'hacking', 'defacement']
}

# Flatten the dictionary for NER tagging
entity_mapping = {word: sub_category for sub_category, words in domain_entities.items() for word in words}

# Tagging Function
def tag_domain_entities(tokens):
    return [(token, entity_mapping.get(token.lower(), 'O')) for token in tokens]

# Parallel Apply Function
def parallel_apply(df, func, column):
    return Parallel(n_jobs=-1)(delayed(func)(tokens) for tokens in df[column])

# Toggle for Method Selection
use_parallel = True  # Set to False to use tqdm with pandas

if use_parallel:
    # Apply tagging in parallel
    train_df['domain_tags'] = parallel_apply(train_df, tag_domain_entities, 'filtered_tokens')
    test_df['domain_tags'] = parallel_apply(test_df, tag_domain_entities, 'filtered_tokens')
else:
    # Apply tagging with tqdm progress bar
    train_df['domain_tags'] = train_df['filtered_tokens'].progress_apply(tag_domain_entities)
    test_df['domain_tags'] = test_df['filtered_tokens'].progress_apply(tag_domain_entities)

# Check domain tagging
print("Domain-Based Entity Tags (Sample):")
print(train_df[['filtered_tokens', 'domain_tags']].head())


Domain-Based Entity Tags (Sample):
                                     filtered_tokens  \
0  [i, had, continue, received, random, calls, ab...   
1  [above, fraudster, is, continuously, messaging...   
2  [is, acting, police, demanding, money, adding,...   
3  [apna, job, i, have, applied, job, interview, ...   
4  [i, received, call, lady, stating, send, new, ...   

                                         domain_tags  
0  [(i, O), (had, O), (continue, O), (received, O...  
1  [(above, O), (fraudster, O), (is, O), (continu...  
2  [(is, O), (acting, O), (police, O), (demanding...  
3  [(apna, O), (job, Online Job Fraud), (i, O), (...  
4  [(i, O), (received, O), (call, Fraud CallVishi...  


In [30]:
# Save the processed training data to a CSV file
train_df.to_csv('processed_train.csv', index=False)

# Save the processed test data to a CSV file
test_df.to_csv('processed_test.csv', index=False)

print("Processed data saved as CSV files:")
print("- Processed training data: processed_train.csv")
print("- Processed test data: processed_test.csv")


Processed data saved as CSV files:
- Processed training data: processed_train.csv
- Processed test data: processed_test.csv
