In [10]:
# Step 0: Install and Import Required Libraries
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [11]:
# Step 1: Download NLTK resources (only first time)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/argus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/argus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/argus/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/argus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/argus/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
df = pd.read_csv("Social_Network_Ads.csv")

In [13]:
# Display the first few rows to understand the structure
print("Dataset preview:")
print(df.head())

Dataset preview:
    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0


In [14]:
# Check the column names to find text columns
print("\nColumns in the dataset:")
print(df.columns.tolist())


Columns in the dataset:
['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased']


In [15]:
# Step 3: Create Sample Text Data from Dataset
# Since Social_Network_Ads doesn't have much text, we'll create text descriptions
# Step 3.1: Generate text descriptions based on user demographics

def generate_description(row):
    gender = row['Gender']
    age = row['Age']
    salary = row['EstimatedSalary']
    purchased = row['Purchased']
    
    description = f"{gender} user aged {age} with estimated salary of {salary} "
    if purchased == 1:
        description += "purchased the product after seeing social media advertisements."
    else:
        description += "did not purchase the product despite seeing social media advertisements."
    
    return description

# Step 3.2: Create new column with text descriptions
df['Description'] = df.apply(generate_description, axis=1)

# Step 3.3: Display some sample descriptions
print("\nSample text descriptions:")
for i in range(5):
    print(f"Sample {i+1}: {df['Description'].iloc[i]}")


Sample text descriptions:
Sample 1: Male user aged 19 with estimated salary of 19000 did not purchase the product despite seeing social media advertisements.
Sample 2: Male user aged 35 with estimated salary of 20000 did not purchase the product despite seeing social media advertisements.
Sample 3: Female user aged 26 with estimated salary of 43000 did not purchase the product despite seeing social media advertisements.
Sample 4: Female user aged 27 with estimated salary of 57000 did not purchase the product despite seeing social media advertisements.
Sample 5: Male user aged 19 with estimated salary of 76000 did not purchase the product despite seeing social media advertisements.


In [22]:
nltk.download('averaged_perceptron_tagger_eng')

# Step 4: Define Text Preprocessing Function
def preprocess_text(text):
    # Step 4.1: Convert to lowercase
    text = text.lower()
    print(f"After lowercase: {text}")
    
    # Step 4.2: Tokenization
    tokens = word_tokenize(text)
    print(f"After tokenization: {tokens}")
    
    # Step 4.3: Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    print(f"After removing punctuation: {tokens}")
    
    # Step 4.4: Remove Stop Words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    print(f"After removing stop words: {filtered_tokens}")
    
    # Step 4.5: POS Tagging
    pos_tags = pos_tag(filtered_tokens)
    print(f"POS tags: {pos_tags}")
    
    # Step 4.6: Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    print(f"After stemming: {stemmed_tokens}")
    
    # Step 4.7: Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    print(f"After lemmatization: {lemmatized_tokens}")
    
    return {
        'tokens': tokens,
        'filtered': filtered_tokens,
        'pos_tags': pos_tags,
        'stemmed': stemmed_tokens,
        'lemmatized': lemmatized_tokens
    }

# Step 4.8: Test the preprocessing function on one sample
print("\nTesting preprocessing function on the first description:")
sample_text = df['Description'].iloc[0]
print(f"Original text: {sample_text}")
result = preprocess_text(sample_text)


Testing preprocessing function on the first description:
Original text: Male user aged 19 with estimated salary of 19000 did not purchase the product despite seeing social media advertisements.
After lowercase: male user aged 19 with estimated salary of 19000 did not purchase the product despite seeing social media advertisements.
After tokenization: ['male', 'user', 'aged', '19', 'with', 'estimated', 'salary', 'of', '19000', 'did', 'not', 'purchase', 'the', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements', '.']
After removing punctuation: ['male', 'user', 'aged', '19', 'with', 'estimated', 'salary', 'of', '19000', 'did', 'not', 'purchase', 'the', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements']
After removing stop words: ['male', 'user', 'aged', '19', 'estimated', 'salary', '19000', 'purchase', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements']
POS tags: [('male', 'JJ'), ('user', 'NN'), ('aged', 'VBD'), ('19', 'CD'), ('estimate

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/argus/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [23]:
# Step 5: Process Multiple Text Samples
# Step 5.1: Select a subset of descriptions to process
sample_descriptions = df['Description'].head(3).tolist()

# Step 5.2: Process each description
for i, text in enumerate(sample_descriptions):
    print(f"\n--- Sample {i+1} ---")
    print(f"Original: {text}")
    result = preprocess_text(text)
    
    # Display summary of results
    print("\nSummary:")
    print(f"Original Tokens: {len(result['tokens'])} tokens")
    print(f"After Stop Removal: {len(result['filtered'])} tokens")
    print(f"Parts of Speech: {result['pos_tags']}")
    print(f"After Stemming: {result['stemmed']}")
    print(f"After Lemmatization: {result['lemmatized']}")


--- Sample 1 ---
Original: Male user aged 19 with estimated salary of 19000 did not purchase the product despite seeing social media advertisements.
After lowercase: male user aged 19 with estimated salary of 19000 did not purchase the product despite seeing social media advertisements.
After tokenization: ['male', 'user', 'aged', '19', 'with', 'estimated', 'salary', 'of', '19000', 'did', 'not', 'purchase', 'the', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements', '.']
After removing punctuation: ['male', 'user', 'aged', '19', 'with', 'estimated', 'salary', 'of', '19000', 'did', 'not', 'purchase', 'the', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements']
After removing stop words: ['male', 'user', 'aged', '19', 'estimated', 'salary', '19000', 'purchase', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements']
POS tags: [('male', 'JJ'), ('user', 'NN'), ('aged', 'VBD'), ('19', 'CD'), ('estimated', 'VBN'), ('salary', 'JJ'), ('19000', 'CD')

In [24]:

# Step 6: Perform TF-IDF Vectorization
# Step 6.1: Prepare corpus by preprocessing all descriptions
print("\n# Step 6.1: Preparing corpus...")
preprocessed_docs = []
# Using only 5 descriptions to keep output manageable
for idx, text in enumerate(df['Description'].head(5)):
    print(f"\nPreprocessing document {idx+1}...")
    preprocessed = preprocess_text(text)
    preprocessed_docs.append(' '.join(preprocessed['lemmatized']))

# Step 6.2: Apply TF-IDF Vectorization
print("\n# Step 6.2: Applying TF-IDF Vectorization...")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs)

# Step 6.3: Get feature names (vocabulary)
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nVocabulary (unique words): {feature_names}")

# Step 6.4: Display TF-IDF matrix
print("\nTF-IDF Matrix shape:", tfidf_matrix.shape)
print("TF-IDF Matrix (first 5 rows):")
print(tfidf_matrix.toarray())


# Step 6.1: Preparing corpus...

Preprocessing document 1...
After lowercase: male user aged 19 with estimated salary of 19000 did not purchase the product despite seeing social media advertisements.
After tokenization: ['male', 'user', 'aged', '19', 'with', 'estimated', 'salary', 'of', '19000', 'did', 'not', 'purchase', 'the', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements', '.']
After removing punctuation: ['male', 'user', 'aged', '19', 'with', 'estimated', 'salary', 'of', '19000', 'did', 'not', 'purchase', 'the', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements']
After removing stop words: ['male', 'user', 'aged', '19', 'estimated', 'salary', '19000', 'purchase', 'product', 'despite', 'seeing', 'social', 'media', 'advertisements']
POS tags: [('male', 'JJ'), ('user', 'NN'), ('aged', 'VBD'), ('19', 'CD'), ('estimated', 'VBN'), ('salary', 'JJ'), ('19000', 'CD'), ('purchase', 'NN'), ('product', 'NN'), ('despite', 'IN'), ('seeing', 'VBG'), ('social'

In [25]:
# Step 7: Analyze TF-IDF Scores
# Step 7.1: Show TF-IDF scores for each word in each document
print("\n# Step 7.1: Analyzing TF-IDF scores...")
for i, doc in enumerate(preprocessed_docs):
    print(f"\nDocument {i+1} TF-IDF scores:")
    feature_index = tfidf_matrix[i,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    for word_idx, score in sorted(tfidf_scores, key=lambda x: x[1], reverse=True):
        print(f"{feature_names[word_idx]}: {score:.4f}")


# Step 7.1: Analyzing TF-IDF scores...

Document 1 TF-IDF scores:
19000: 0.4664
19: 0.3763
male: 0.3124
user: 0.2222
aged: 0.2222
estimated: 0.2222
salary: 0.2222
purchase: 0.2222
product: 0.2222
despite: 0.2222
seeing: 0.2222
social: 0.2222
medium: 0.2222
advertisement: 0.2222

Document 2 TF-IDF scores:
35: 0.4496
20000: 0.4496
male: 0.3011
user: 0.2143
aged: 0.2143
estimated: 0.2143
salary: 0.2143
purchase: 0.2143
product: 0.2143
despite: 0.2143
seeing: 0.2143
social: 0.2143
medium: 0.2143
advertisement: 0.2143

Document 3 TF-IDF scores:
26: 0.4407
43000: 0.4407
female: 0.3556
user: 0.2100
aged: 0.2100
estimated: 0.2100
salary: 0.2100
purchase: 0.2100
product: 0.2100
despite: 0.2100
seeing: 0.2100
social: 0.2100
medium: 0.2100
advertisement: 0.2100

Document 4 TF-IDF scores:
27: 0.4407
57000: 0.4407
female: 0.3556
user: 0.2100
aged: 0.2100
estimated: 0.2100
salary: 0.2100
purchase: 0.2100
product: 0.2100
despite: 0.2100
seeing: 0.2100
social: 0.2100
medium: 0.2100
advertisement: 0.2

In [26]:
# Step 8: Apply Bag of Words (Count Vectorization)
# Step 8.1: Use CountVectorizer on the preprocessed documents
print("\n# Step 8.1: Applying Bag of Words...")
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(preprocessed_docs)

# Step 8.2: Get feature names
count_features = count_vectorizer.get_feature_names_out()
print(f"\nCount Vectorizer Vocabulary: {count_features}")

# Step 8.3: Display Count matrix
print("\nCount Matrix (Bag of Words):")
print(count_matrix.toarray())


# Step 8.1: Applying Bag of Words...

Count Vectorizer Vocabulary: ['19' '19000' '20000' '26' '27' '35' '43000' '57000' '76000'
 'advertisement' 'aged' 'despite' 'estimated' 'female' 'male' 'medium'
 'product' 'purchase' 'salary' 'seeing' 'social' 'user']

Count Matrix (Bag of Words):
[[1 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1]
 [0 0 1 0 0 1 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1]
 [0 0 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1]
 [0 0 0 0 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1]
 [1 0 0 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1]]


In [27]:
# Step 9: Analyze Word Frequencies
# Step 9.1: Show word counts for each document
print("\n# Step 9.1: Analyzing word frequencies...")
for i, doc in enumerate(preprocessed_docs):
    print(f"\nDocument {i+1} word counts:")
    feature_index = count_matrix[i,:].nonzero()[1]
    count_scores = zip(feature_index, [count_matrix[i, x] for x in feature_index])
    for word_idx, count in sorted(count_scores, key=lambda x: x[1], reverse=True):
        print(f"{count_features[word_idx]}: {count}")


# Step 9.1: Analyzing word frequencies...

Document 1 word counts:
male: 1
user: 1
aged: 1
19: 1
estimated: 1
salary: 1
19000: 1
purchase: 1
product: 1
despite: 1
seeing: 1
social: 1
medium: 1
advertisement: 1

Document 2 word counts:
male: 1
user: 1
aged: 1
estimated: 1
salary: 1
purchase: 1
product: 1
despite: 1
seeing: 1
social: 1
medium: 1
advertisement: 1
35: 1
20000: 1

Document 3 word counts:
user: 1
aged: 1
estimated: 1
salary: 1
purchase: 1
product: 1
despite: 1
seeing: 1
social: 1
medium: 1
advertisement: 1
female: 1
26: 1
43000: 1

Document 4 word counts:
user: 1
aged: 1
estimated: 1
salary: 1
purchase: 1
product: 1
despite: 1
seeing: 1
social: 1
medium: 1
advertisement: 1
female: 1
27: 1
57000: 1

Document 5 word counts:
male: 1
user: 1
aged: 1
19: 1
estimated: 1
salary: 1
purchase: 1
product: 1
despite: 1
seeing: 1
social: 1
medium: 1
advertisement: 1
76000: 1


In [28]:
# Step 10: Compare Documents Using TF-IDF
# Step 10.1: Calculate document similarity using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

print("\n# Step 10.1: Calculating document similarity...")
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("\nCosine Similarity Matrix:")
print(cosine_sim)

# Step 10.2: Identify most similar documents
print("\n# Step 10.2: Most similar document pairs:")
for i in range(len(preprocessed_docs)):
    for j in range(i+1, len(preprocessed_docs)):
        print(f"Similarity between Document {i+1} and Document {j+1}: {cosine_sim[i][j]:.4f}")


# Step 10.1: Calculating document similarity...

Cosine Similarity Matrix:
[[1.         0.61784642 0.51338811 0.51338811 0.7824697 ]
 [0.61784642 1.         0.49493996 0.49493996 0.61784642]
 [0.51338811 0.49493996 1.         0.61154089 0.51338811]
 [0.51338811 0.49493996 0.61154089 1.         0.51338811]
 [0.7824697  0.61784642 0.51338811 0.51338811 1.        ]]

# Step 10.2: Most similar document pairs:
Similarity between Document 1 and Document 2: 0.6178
Similarity between Document 1 and Document 3: 0.5134
Similarity between Document 1 and Document 4: 0.5134
Similarity between Document 1 and Document 5: 0.7825
Similarity between Document 2 and Document 3: 0.4949
Similarity between Document 2 and Document 4: 0.4949
Similarity between Document 2 and Document 5: 0.6178
Similarity between Document 3 and Document 4: 0.6115
Similarity between Document 3 and Document 5: 0.5134
Similarity between Document 4 and Document 5: 0.5134
