In [1]:
pip install nltk pandas



In [15]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Fetch the NLTK resources needed for tokenizing and tagging below
# (nltk.download reports "already up-to-date" when they are already installed)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the mail data; the path is relative to the notebook's working directory.
# The rest of this cell reads the 'Message' column of this frame.
df = pd.read_csv('mail_data.csv')

# POS tag abbreviation -> human-readable name.
# Built once at definition time rather than on every call, because the lookup
# function below is invoked once per token.
_POS_FULL_NAMES = {
    'CC': 'Coordinating conjunction', 'CD': 'Cardinal number', 'DT': 'Determiner', 'EX': 'Existential there',
    'FW': 'Foreign word', 'IN': 'Preposition or subordinating conjunction', 'JJ': 'Adjective',
    'JJR': 'Adjective, comparative', 'JJS': 'Adjective, superlative', 'LS': 'List item marker',
    'MD': 'Modal', 'NN': 'Noun, singular or mass', 'NNS': 'Noun, plural', 'NNP': 'Proper noun, singular',
    'NNPS': 'Proper noun, plural', 'PDT': 'Predeterminer', 'POS': 'Possessive ending', 'PRP': 'Personal pronoun',
    'PRP$': 'Possessive pronoun', 'RB': 'Adverb', 'RBR': 'Adverb, comparative', 'RBS': 'Adverb, superlative',
    'RP': 'Particle', 'SYM': 'Symbol', 'TO': 'to', 'UH': 'Interjection', 'VB': 'Verb, base form',
    'VBD': 'Verb, past tense', 'VBG': 'Verb, gerund or present participle', 'VBN': 'Verb, past participle',
    'VBP': 'Verb, non-3rd person singular present', 'VBZ': 'Verb, 3rd person singular present',
    'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb'
}


# Function to map POS tags to their full names
def map_pos_to_full_name(tag):
    """Return the descriptive name for a POS tag abbreviation.

    Parameters
    ----------
    tag : str
        Tag abbreviation such as 'NN' or 'VBZ'.

    Returns
    -------
    str
        The full name from the lookup table, or ``tag`` itself when the
        abbreviation is not in the table (e.g. punctuation tags like ',' or
        ':' pass through unchanged).
    """
    return _POS_FULL_NAMES.get(tag, tag)

# Function to extract POS tags and their counts with full names
def pos_features(Message):
    """Count the part-of-speech tags that occur in one message.

    Parameters
    ----------
    Message : str
        Message text to tokenize and tag. Any non-string value (for example
        the float NaN that pandas produces for an empty CSV cell, or None)
        is treated as a message with no tokens instead of raising inside the
        tokenizer.

    Returns
    -------
    collections.Counter
        Maps each tag's full name (see ``map_pos_to_full_name``) to the number
        of tokens carrying that tag. Empty for non-string input.
    """
    # Guard first: word_tokenize only accepts text.
    if not isinstance(Message, str):
        return Counter()
    tagged_tokens = pos_tag(word_tokenize(Message))
    # Only the tag is needed for counting; the token itself is discarded.
    return Counter(map_pos_to_full_name(tag) for _, tag in tagged_tokens)

# Function to tag each word with its POS and map to full names
def tag_words_with_pos(Message):
    """Pair every token of a message with the full name of its POS tag.

    Parameters
    ----------
    Message : str
        Message text to tokenize and tag. Any non-string value (for example
        the float NaN that pandas produces for an empty CSV cell, or None)
        is treated as a message with no tokens instead of raising inside the
        tokenizer.

    Returns
    -------
    list of (str, str)
        ``(token, full POS name)`` pairs in token order; tags without a full
        name are kept as their raw abbreviation (see
        ``map_pos_to_full_name``). Empty for non-string input.
    """
    # Guard first: word_tokenize only accepts text.
    if not isinstance(Message, str):
        return []
    tagged_tokens = pos_tag(word_tokenize(Message))
    return [(token, map_pos_to_full_name(tag)) for token, tag in tagged_tokens]

# Count POS tags for every row of the 'Message' column (one Counter per message)
df['POS_Tags'] = df['Message'].apply(pos_features)

# Store the (token, POS name) pairs for every row of the 'Message' column.
# NOTE: this tokenizes and tags each message a second time; pos_features above
# already ran the same word_tokenize + pos_tag steps.
df['Tagged_Words'] = df['Message'].apply(tag_words_with_pos)

# Expand the Counters into one column per POS name; a tag that does not occur
# in a message comes out as NaN and is replaced with 0
pos_df = pd.DataFrame(df['POS_Tags'].tolist()).fillna(0)

# Append the count columns side by side with the original columns and drop the
# intermediate Counter column, which is now redundant
df = pd.concat([df, pos_df], axis=1).drop(columns=['POS_Tags'])

# Save the enhanced DataFrame to a new CSV file (row index is not written)
df.to_csv('enhanced_mail_data_with_tagged_words.csv', index=False)

# Display the first few rows of the enhanced DataFrame
print(df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


  Category                                            Message  \
0      ham  Go until jurong point, crazy.. Available only ...   
1      ham                      Ok lar... Joking wif u oni...   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3      ham  U dun say so early hor... U c already then say...   
4      ham  Nah I don't think he goes to usf, he lives aro...   

                                        Tagged_Words  Proper noun, singular  \
0  [(Go, Proper noun, singular), (until, Preposit...                    3.0   
1  [(Ok, Proper noun, singular), (lar, Noun, sing...                    2.0   
2  [(Free, Adjective), (entry, Noun, singular or ...                    6.0   
3  [(U, Adjective), (dun, Noun, plural), (say, Ve...                    1.0   
4  [(Nah, Proper noun, singular), (I, Personal pr...                    1.0   

   Preposition or subordinating conjunction  Adjective  \
0                                       2.0        3.0   
1                 

In [9]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Fetch the NLTK resources needed for tokenizing and tagging below
# (nltk.download reports "already up-to-date" when they are already installed)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load the mail data; the path is relative to the notebook's working directory.
# The rest of this cell reads the 'Message' column of this frame.
df = pd.read_csv('mail_data.csv')

# POS tag abbreviation -> human-readable name.
# Built once at definition time rather than on every call, because the lookup
# function below is invoked once per token.
_POS_FULL_NAMES = {
    'CC': 'Coordinating conjunction', 'CD': 'Cardinal number', 'DT': 'Determiner', 'EX': 'Existential there',
    'FW': 'Foreign word', 'IN': 'Preposition or subordinating conjunction', 'JJ': 'Adjective',
    'JJR': 'Adjective, comparative', 'JJS': 'Adjective, superlative', 'LS': 'List item marker',
    'MD': 'Modal', 'NN': 'Noun, singular or mass', 'NNS': 'Noun, plural', 'NNP': 'Proper noun, singular',
    'NNPS': 'Proper noun, plural', 'PDT': 'Predeterminer', 'POS': 'Possessive ending', 'PRP': 'Personal pronoun',
    'PRP$': 'Possessive pronoun', 'RB': 'Adverb', 'RBR': 'Adverb, comparative', 'RBS': 'Adverb, superlative',
    'RP': 'Particle', 'SYM': 'Symbol', 'TO': 'to', 'UH': 'Interjection', 'VB': 'Verb, base form',
    'VBD': 'Verb, past tense', 'VBG': 'Verb, gerund or present participle', 'VBN': 'Verb, past participle',
    'VBP': 'Verb, non-3rd person singular present', 'VBZ': 'Verb, 3rd person singular present',
    'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb'
}


# Function to map POS tags to their full names
def map_pos_to_full_name(tag):
    """Return the descriptive name for a POS tag abbreviation.

    Parameters
    ----------
    tag : str
        Tag abbreviation such as 'NN' or 'VBZ'.

    Returns
    -------
    str
        The full name from the lookup table, or ``tag`` itself when the
        abbreviation is not in the table (e.g. punctuation tags like ',' or
        ':' pass through unchanged).
    """
    return _POS_FULL_NAMES.get(tag, tag)

# Function to extract POS tags and their counts with full names
def pos_features(text):
    """Count the part-of-speech tags that occur in one piece of text.

    Parameters
    ----------
    text : str
        Text to tokenize and tag. Any non-string value (for example the float
        NaN that pandas produces for an empty CSV cell, or None) is treated
        as text with no tokens instead of raising inside the tokenizer.

    Returns
    -------
    collections.Counter
        Maps each tag's full name (see ``map_pos_to_full_name``) to the number
        of tokens carrying that tag. Empty for non-string input.
    """
    # Guard first: word_tokenize only accepts text.
    if not isinstance(text, str):
        return Counter()
    tagged_tokens = pos_tag(word_tokenize(text))
    # Only the tag is needed for counting; the token itself is discarded.
    return Counter(map_pos_to_full_name(tag) for _, tag in tagged_tokens)

# Count POS tags for every row of the 'Message' column (one Counter per message)
df['POS_Tags'] = df['Message'].apply(pos_features)

# Expand the Counters into one column per POS name; a tag that does not occur
# in a message comes out as NaN and is replaced with 0
pos_df = pd.DataFrame(df['POS_Tags'].tolist()).fillna(0)

# Append the count columns side by side with the original columns and drop the
# intermediate Counter column, which is now redundant
df = pd.concat([df, pos_df], axis=1).drop(columns=['POS_Tags'])

# Save the enhanced DataFrame to a new CSV file (row index is not written)
df.to_csv('11enhanced_mail_data_with_pos_features.csv', index=False)

# Display the first few rows of the enhanced DataFrame
print(df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


  Category                                            Message  \
0      ham  Go until jurong point, crazy.. Available only ...   
1      ham                      Ok lar... Joking wif u oni...   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3      ham  U dun say so early hor... U c already then say...   
4      ham  Nah I don't think he goes to usf, he lives aro...   

   Proper noun, singular  Preposition or subordinating conjunction  Adjective  \
0                    3.0                                       2.0        3.0   
1                    2.0                                       0.0        1.0   
2                    6.0                                       1.0        4.0   
3                    1.0                                       0.0        2.0   
4                    1.0                                       1.0        0.0   

   Noun, singular or mass    ,  Adverb  Foreign word    :  ...  Predeterminer  \
0                     7.0  1.0     3.0   