<a href="https://colab.research.google.com/github/33Surya66/33Surya66/blob/main/Segmentation_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install faker  # Install faker if you haven't already

import pandas as pd
from faker import Faker
import re

def generate_10_digit_phone():
    """Generates a random 10-digit phone number."""
    fake = Faker()
    while True:
        phone = fake.phone_number()
        # Remove non-digit characters and check length
        cleaned_phone = re.sub(r'\D', '', phone)
        if len(cleaned_phone) == 10:
            return cleaned_phone

def has_4_wheeler(age):
    """Randomly determines if a person has a 4-wheeler based on age."""
    fake = Faker()
    if age > 18:
        return fake.boolean(chance_of_getting_true=50)  # 50% chance
    else:
        return False

# Load your CSV file into a pandas DataFrame
try:
    df = pd.read_csv('/content/train.csv')
except FileNotFoundError:
    print("Error: The file '/content/train.csv' was not found.")
    # Handle the file not found error, e.g., exit or use a sample DataFrame
    df = pd.DataFrame() # Create an empty DataFrame if the file is not found

# Add new columns with fake data using Faker methods
if not df.empty: # Proceed only if the DataFrame is not empty
    df['Phone'] = [generate_10_digit_phone() for _ in range(len(df))]  # Use the custom function
    df['Email'] = [fake.email() for _ in range(len(df))]
    df['Address'] = [fake.address() for _ in range(len(df))]
    # ... add more columns with other Faker providers as needed ...

    # Add 'Has_4_Wheeler' column based on age
    df['Has_4_Wheeler'] = df['age'].apply(has_4_wheeler)  # Assuming you have an 'Age' column

    print(df.head(200)) # Print the first few rows of the updated DataFrame
# ... (rest of your code)

     age           job   marital  education default  balance housing loan  \
0     58    management   married   tertiary      no     2143     yes   no   
1     44    technician    single  secondary      no       29     yes   no   
2     33  entrepreneur   married  secondary      no        2     yes  yes   
3     47   blue-collar   married    unknown      no     1506     yes   no   
4     33       unknown    single    unknown      no        1      no   no   
..   ...           ...       ...        ...     ...      ...     ...  ...   
195   33   blue-collar    single  secondary      no      307     yes   no   
196   38      services   married  secondary      no      155     yes   no   
197   50    technician  divorced   tertiary      no      173      no  yes   
198   43    management   married   tertiary      no      400     yes   no   
199   61   blue-collar  divorced    primary      no     1428     yes   no   

     contact  day  ... duration  campaign  pdays  previous  poutcome   y  \

In [12]:
print(df.head(200))
print(len(df))

      age           job   marital  education default  balance housing loan  \
0    58.0    management   married   tertiary      no   2143.0     yes   no   
1    44.0    technician    single  secondary      no     29.0     yes   no   
2    33.0  entrepreneur   married  secondary      no      2.0     yes  yes   
3    47.0   blue-collar   married    unknown      no   1506.0     yes   no   
4    33.0       unknown    single    unknown      no      1.0      no   no   
..    ...           ...       ...        ...     ...      ...     ...  ...   
195  33.0   blue-collar    single  secondary      no    307.0     yes   no   
196  38.0      services   married  secondary      no    155.0     yes   no   
197  50.0    technician  divorced   tertiary      no    173.0      no  yes   
198  43.0    management   married   tertiary      no    400.0     yes   no   
199  61.0   blue-collar  divorced    primary      no   1428.0     yes   no   

     contact  day month  duration  campaign  pdays  previous po

In [8]:
# prompt: clean data

import pandas as pd
import numpy as np

# Assuming 'df' is your DataFrame (loaded previously)
# Replace NaN values with a specific value (e.g., 0)
df.fillna(0, inplace=True)

# Remove duplicate rows
df.drop_duplicates(inplace=True)


# Convert columns to appropriate data types
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Handle non-numeric values in numeric columns
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].fillna(0)  # Fill NaN values with 0
    elif pd.api.types.is_string_dtype(df[col]):
        df[col] = df[col].str.strip() # Remove leading/trailing spaces
        # Handle invalid strings (e.g., replace with empty string)
        df[col] = df[col].replace(r'[^\x00-\x7F]+', '', regex=True)

# Example of outlier removal (using IQR method)
def remove_outliers_iqr(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data >= lower_bound) & (data <= upper_bound)]

for col in df.select_dtypes(include=np.number).columns:
    df[col] = remove_outliers_iqr(df[col])

# Fill NaN values resulting from outlier removal with mean for numeric columns
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].mean())

print(df.head(200))

      age           job   marital  education default  balance housing loan  \
0    58.0    management   married   tertiary      no   2143.0     yes   no   
1    44.0    technician    single  secondary      no     29.0     yes   no   
2    33.0  entrepreneur   married  secondary      no      2.0     yes  yes   
3    47.0   blue-collar   married    unknown      no   1506.0     yes   no   
4    33.0       unknown    single    unknown      no      1.0      no   no   
..    ...           ...       ...        ...     ...      ...     ...  ...   
195  33.0   blue-collar    single  secondary      no    307.0     yes   no   
196  38.0      services   married  secondary      no    155.0     yes   no   
197  50.0    technician  divorced   tertiary      no    173.0      no  yes   
198  43.0    management   married   tertiary      no    400.0     yes   no   
199  61.0   blue-collar  divorced    primary      no   1428.0     yes   no   

     contact  day month  duration  campaign  pdays  previous po

In [13]:
# prompt: resulting number of rows

import pandas as pd
from faker import Faker
import re
import numpy as np

# ... (rest of your code) ...

print(len(df))

45211


In [None]:
# prompt: clean data further

import pandas as pd
from faker import Faker
import re
import numpy as np

!pip install faker  # Install faker if you haven't already

def generate_10_digit_phone():
    """Generates a random 10-digit phone number."""
    fake = Faker()
    while True:
        phone = fake.phone_number()
        cleaned_phone = re.sub(r'\D', '', phone)
        if len(cleaned_phone) == 10:
            return cleaned_phone

try:
    df = pd.read_csv('/content/train.csv')
except FileNotFoundError:
    print("Error: The file '/content/train.csv' was not found.")
    df = pd.DataFrame()

fake = Faker() # Initialize Faker object outside the loop for better performance

if not df.empty:
    df['Phone'] = [generate_10_digit_phone() for _ in range(len(df))]
    df['Email'] = [fake.email() for _ in range(len(df))]
    df['Address'] = [fake.address() for _ in range(len(df))]

    # ... other column additions ...

# Data Cleaning Enhancements
if not df.empty:
    # Handle missing values more robustly
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')
            # Use median instead of mean for outlier-resistant imputation
            df[col] = df[col].fillna(df[col].median())
        elif pd.api.types.is_string_dtype(df[col]):
            df[col] = df[col].astype(str).str.strip()
            df[col] = df[col].replace(r'[^\x00-\x7F]+', '', regex=True) # Remove non-ASCII chars
            df[col] = df[col].fillna('') # Fill NaN with empty string

    # Improved outlier removal (handling potential errors gracefully)
    for col in df.select_dtypes(include=np.number).columns:
        try:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df[col] = df[col][(df[col] >= lower_bound) & (df[col] <= upper_bound)]
            df[col] = df[col].fillna(df[col].median()) # Fill with median after outlier removal
        except Exception as e: #Catch any exception during outlier removal
            print(f"Error processing column '{col}': {e}")

    # Remove duplicate rows (after cleaning)
    df.drop_duplicates(inplace=True)


print(df.head(200))
print(len(df))

In [None]:
!pip install PyPDF2  # Install PyPDF2 for PDF extraction
!pip install nltk
!pip install requests
!pip install beautifulsoup4
!pip install scikit-learn

import PyPDF2
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
        return text

# Function to clean text (remove special characters, numbers, and stopwords)
def clean_text(text):
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Function to load user data from PDF (assuming tabular data)
def load_user_data_from_pdf(pdf_file):
    # Assume the PDF contains tabular data that can be converted to a DataFrame
    pdf_text = extract_text_from_pdf(pdf_file)
    # ... (Logic to convert pdf_text to a DataFrame)
    # You may need to use libraries like camelot, tabula-py, or regular expressions
    # to extract the tabular data from the PDF text.
    # Example (using pandas read_csv with a StringIO object):
    # from io import StringIO
    # df = pd.read_csv(StringIO(pdf_text), separator=',')
    # ... (Other processing to clean and format the DataFrame)
    return df  # Return the created DataFrame

# Function to match users to the policy based on text similarity
def find_target_audience(user_data, policy_text, user_text_column='Description'):  # Assuming 'Description' column in user data
    vectorizer = TfidfVectorizer()
    user_vectors = vectorizer.fit_transform(user_data[user_text_column].fillna(''))
    policy_vector = vectorizer.transform([policy_text])
    similarities = cosine_similarity(user_vectors, policy_vector)
    user_data['Relevance'] = similarities
    sorted_users = user_data.sort_values(by='Relevance', ascending=False)
    return sorted_users

# Function to save results to a CSV file
def save_results(target_audience, output_file):
    target_audience.to_csv(output_file, index=False)
    print(f"Target audience saved to {output_file}")

# Main script
def main():
    user_data_pdf = '/content/train.csv'  # Path to the user data PDF
    policy_pdf = '/content/policy_document.pdf/content/bajaj-allianz-car-insurance-policy-brochurepdf.pdf'  # Path to the policy document PDF
    output_file = '/content/target_audience.csv'  # Path to save the results

    try:
        # Load user data from PDF
        user_data = load_user_data_from_pdf(user_data_pdf)
        print("User data loaded successfully!")

        # Extract policy text from PDF
        policy_text = extract_text_from_pdf(policy_pdf)
        policy_text = clean_text(policy_text)  # Clean the policy text
        print("Policy text extracted and cleaned successfully!")

        # Find target audience
        target_audience = find_target_audience(user_data, policy_text)
        print("Target audience identified successfully!")

        # Save results
        save_results(target_audience, output_file)
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

In [20]:
!pip install faker  # Install faker if you haven't already
!pip install PyPDF2  # Install PyPDF2 for PDF extraction
!pip install nltk
!pip install requests
!pip install beautifulsoup4
!pip install scikit-learn

import pandas as pd
from faker import Faker
import re
import numpy as np
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def generate_10_digit_phone():
    """Generates a random 10-digit phone number."""
    fake = Faker()
    while True:
        phone = fake.phone_number()
        # Remove non-digit characters and check length
        cleaned_phone = re.sub(r'\D', '', phone)
        if len(cleaned_phone) == 10:
            return cleaned_phone


def has_4_wheeler(age):
    """Randomly determines if a person has a 4-wheeler based on age."""
    fake = Faker()
    if age > 18:
        return fake.boolean(chance_of_getting_true=50)  # 50% chance
    else:
        return False


def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
        return text


def clean_text(text):
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


def load_user_data_from_csv(csv_file):
    try:
        df = pd.read_csv(csv_file)
        # Add new columns with fake data using Faker methods
        if not df.empty:
            df['Phone'] = [generate_10_digit_phone()
                          for _ in range(len(df))]
            df['Email'] = [fake.email() for _ in range(len(df))]
            df['Address'] = [fake.address() for _ in range(len(df))]
            # Add 'Has_4_Wheeler' column based on age
            df['Has_4_Wheeler'] = df['age'].apply(has_4_wheeler)
        return df
    except FileNotFoundError:
        print(f"Error: The file '{csv_file}' was not found.")
        return pd.DataFrame()  # Return an empty DataFrame if file not found


def find_target_audience(user_data, policy_text, user_text_column='Description'):
    vectorizer = TfidfVectorizer()
    user_vectors = vectorizer.fit_transform(
        user_data[user_text_column].fillna(''))
    policy_vector = vectorizer.transform([policy_text])
    similarities = cosine_similarity(user_vectors, policy_vector)
    user_data['Relevance'] = similarities
    sorted_users = user_data.sort_values(by='Relevance', ascending=False)
    return sorted_users


def save_results(target_audience, output_file):
    target_audience.to_csv(output_file, index=False)
    print(f"Target audience saved to {output_file}")


def main():
    user_data_csv = '/content/train.csv'
    policy_pdf = '/content/bajaj-allianz-car-insurance-policy-brochurepdf.pdf'
    output_file = '/content/target_audience.csv'

    try:
        # Load user data from CSV
        user_data = load_user_data_from_csv(user_data_csv)
        print("User data loaded successfully!")

        # Filter users who do not have a 4-wheeler
        user_data_no_4wheeler = user_data[user_data['Has_4_Wheeler'] == False]
        print("Filtered users without 4-wheelers.")

        # Extract policy text from PDF
        policy_text = extract_text_from_pdf(policy_pdf)
        policy_text = clean_text(policy_text)
        print("Policy text extracted and cleaned successfully!")

        # Find target audience (from filtered users)
        target_audience = find_target_audience(
            user_data_no_4wheeler, policy_text)
        print("Target audience identified successfully!")


        # Save results
        save_results(target_audience, output_file)
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    fake = Faker()  # Initialize Faker object outside the loop
    main()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


User data loaded successfully!
Filtered users without 4-wheelers.
Policy text extracted and cleaned successfully!
Error: 'Description'


In [22]:
!pip install PyPDF2
!pip install nltk

import PyPDF2
import nltk
from nltk.corpus import stopwords
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def extract_text_from_pdf(pdf_file):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
        return text

def clean_text(text):
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def main():
    policy_pdf = '/content/bajaj-allianz-car-insurance-policy-brochurepdf.pdf'

    try:
        # Extract policy text from PDF
        policy_text = extract_text_from_pdf(policy_pdf)
        policy_text = clean_text(policy_text)  # Clean the policy text
        print(policy_text) # Print the extracted and cleaned policy text

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


bajaj allianz general insurance co ltd bajaj allianz house airport road yerawada pune irda reg query toll free www bajajallianz com bagichelp bajajallianz co policy holders download insurance w allet one touch access vailable details risk factors terms conditions please read sales brochure concluding sale cin u pn plc bjaz b feb cin u pn plcbajaj allianz private car package policy ensuring iles smiles private car package policy owning car become affordable days service maintenance become expensive costs owner long run especially car damaged due unavoidable circumstances accident private car package policy designed keeping mind situations car protected need coverages section damage cover accidental loss damage car caused following fire explosion self ignition lightning burglary housebreaking theft riot strike earthquake fire shock damage flood typhoon hurricane storm tempest inundation cyclone hailstorm frost accidental external means malicious act terrorist activity whilst transit road

In [23]:
# prompt: column names

df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y', 'Phone', 'Email', 'Address',
       'Has_4_Wheeler'],
      dtype='object')

In [26]:
def combine_columns_for_text(data, columns):
    """Combine multiple columns into a single text feature for relevance analysis."""
    return data[columns].astype(str).agg(' '.join, axis=1)


def find_target_audience(user_data, policy_text, combined_column_name='CombinedDescription'):
    # Ensure combined_column_name exists
    if combined_column_name not in user_data.columns:
        raise ValueError(f"Column '{combined_column_name}' not found in user_data.")

    # Fill NaN values with an empty string to avoid issues
    user_data[combined_column_name] = user_data[combined_column_name].fillna('')

    # Vectorize user data and policy text
    vectorizer = TfidfVectorizer()
    user_vectors = vectorizer.fit_transform(user_data[combined_column_name])
    policy_vector = vectorizer.transform([policy_text])

    # Calculate cosine similarities
    similarities = cosine_similarity(user_vectors, policy_vector).flatten()
    user_data['Relevance'] = similarities

    # Sort users by relevance
    sorted_users = user_data.sort_values(by='Relevance', ascending=False)
    return sorted_users


def main():
    user_data_csv = '/content/train.csv'
    policy_pdf = '/content/bajaj-allianz-car-insurance-policy-brochurepdf.pdf'
    output_file = '/content/target_audience.csv'

    try:
        # Load user data from CSV
        user_data = load_user_data_from_csv(user_data_csv)
        print("User data loaded successfully!")

        # Filter users who do not have a 4-wheeler
        user_data_no_4wheeler = user_data[user_data['Has_4_Wheeler'] == False]
        print("Filtered users without 4-wheelers.")

        # Combine selected columns into a single column for relevance analysis
        relevant_columns = [
            'age', 'job', 'marital', 'education', 'default', 'balance',
            'housing', 'loan', 'contact', 'day', 'month', 'duration',
            'campaign', 'pdays', 'previous', 'poutcome', 'y', 'Phone',
            'Email', 'Address', 'Has_4_Wheeler'
        ]
        user_data_no_4wheeler['CombinedDescription'] = combine_columns_for_text(user_data_no_4wheeler, relevant_columns)
        print("Combined relevant columns into 'CombinedDescription'.")

        # Extract policy text from PDF
        policy_text = extract_text_from_pdf(policy_pdf)
        policy_text = clean_text(policy_text)
        print("Policy text extracted and cleaned successfully!")

        # Find target audience using combined columns
        target_audience = find_target_audience(user_data_no_4wheeler, policy_text)
        print("Target audience identified successfully!")

        # Save results
        save_results(target_audience, output_file)
    except Exception as e:
        print(f"Error: {e}")


In [27]:
# Install required libraries if not already installed
!pip install faker  # Install faker if you haven't already
!pip install PyPDF2  # Install PyPDF2 for PDF extraction
!pip install nltk
!pip install requests
!pip install beautifulsoup4
!pip install scikit-learn

import pandas as pd
from faker import Faker
import re
import numpy as np
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def generate_10_digit_phone():
    """Generates a random 10-digit phone number."""
    fake = Faker()
    while True:
        phone = fake.phone_number()
        # Remove non-digit characters and check length
        cleaned_phone = re.sub(r'\D', '', phone)
        if len(cleaned_phone) == 10:
            return cleaned_phone


def has_4_wheeler(age):
    """Randomly determines if a person has a 4-wheeler based on age."""
    fake = Faker()
    if age > 18:
        return fake.boolean(chance_of_getting_true=50)  # 50% chance
    else:
        return False


def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
        return text


def clean_text(text):
    """Cleans text by removing non-alphanumeric characters and stop words."""
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


def load_user_data_from_csv(csv_file):
    """Loads user data from a CSV file and adds generated data."""
    try:
        df = pd.read_csv(csv_file)
        if not df.empty:
            fake = Faker()
            # Add new columns with fake data
            df['Phone'] = [generate_10_digit_phone() for _ in range(len(df))]
            df['Email'] = [fake.email() for _ in range(len(df))]
            df['Address'] = [fake.address() for _ in range(len(df))]
            # Add 'Has_4_Wheeler' column based on age
            df['Has_4_Wheeler'] = df['age'].apply(has_4_wheeler)
        return df
    except FileNotFoundError:
        print(f"Error: The file '{csv_file}' was not found.")
        return pd.DataFrame()


def find_target_audience(user_data, policy_text, user_text_column='Has_4_Wheeler'):
    """Finds the target audience based on cosine similarity."""
    # Convert the 'Has_4_Wheeler' column to string
    user_data[user_text_column] = user_data[user_text_column].astype(str)

    # Vectorize user data and policy text
    vectorizer = TfidfVectorizer()
    user_vectors = vectorizer.fit_transform(user_data[user_text_column])
    policy_vector = vectorizer.transform([policy_text])

    # Calculate cosine similarities
    similarities = cosine_similarity(user_vectors, policy_vector).flatten()
    user_data['Relevance'] = similarities

    # Sort users by relevance
    sorted_users = user_data.sort_values(by='Relevance', ascending=False)
    return sorted_users


def save_results(target_audience, output_file):
    """Saves the target audience to a CSV file."""
    target_audience.to_csv(output_file, index=False)
    print(f"Target audience saved to {output_file}")


def main():
    user_data_csv = '/content/train.csv'
    policy_pdf = '/content/bajaj-allianz-car-insurance-policy-brochurepdf.pdf'
    output_file = '/content/target_audience.csv'

    try:
        # Load user data from CSV
        user_data = load_user_data_from_csv(user_data_csv)
        print("User data loaded successfully!")

        # Filter users who do not have a 4-wheeler
        user_data_no_4wheeler = user_data[user_data['Has_4_Wheeler'] == False]
        print("Filtered users without 4-wheelers.")

        # Extract policy text from PDF
        policy_text = extract_text_from_pdf(policy_pdf)
        policy_text = clean_text(policy_text)
        print("Policy text extracted and cleaned successfully!")

        # Find target audience using 'Has_4_Wheeler' column
        target_audience = find_target_audience(user_data_no_4wheeler, policy_text, user_text_column='Has_4_Wheeler')
        print("Target audience identified successfully!")

        # Save results
        save_results(target_audience, output_file)
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


User data loaded successfully!
Filtered users without 4-wheelers.
Policy text extracted and cleaned successfully!
Target audience identified successfully!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data[user_text_column] = user_data[user_text_column].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['Relevance'] = similarities


Target audience saved to /content/target_audience.csv


In [28]:
# Install required libraries if not already installed
!pip install faker
!pip install PyPDF2
!pip install nltk
!pip install scikit-learn

import pandas as pd
from faker import Faker
import re
import numpy as np
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def generate_10_digit_phone():
    """Generates a random 10-digit phone number."""
    fake = Faker()
    while True:
        phone = fake.phone_number()
        cleaned_phone = re.sub(r'\D', '', phone)
        if len(cleaned_phone) == 10:
            return cleaned_phone


def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
        return text


def clean_text(text):
    """Cleans text by removing non-alphanumeric characters and stop words."""
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


def load_user_data_from_csv(csv_file):
    """Loads user data from a CSV file and adds generated data."""
    try:
        df = pd.read_csv(csv_file)
        if not df.empty:
            fake = Faker()
            # Add new columns with fake data
            df['Phone'] = [generate_10_digit_phone() for _ in range(len(df))]
            df['Email'] = [fake.email() for _ in range(len(df))]
            df['Address'] = [fake.address() for _ in range(len(df))]
        return df
    except FileNotFoundError:
        print(f"Error: The file '{csv_file}' was not found.")
        return pd.DataFrame()


def find_target_audience(user_data, policy_text, user_text_column='housing'):
    """Finds the target audience based on cosine similarity."""
    # Convert the 'housing' column to string
    user_data[user_text_column] = user_data[user_text_column].astype(str)

    # Vectorize user data and policy text
    vectorizer = TfidfVectorizer()
    user_vectors = vectorizer.fit_transform(user_data[user_text_column])
    policy_vector = vectorizer.transform([policy_text])

    # Calculate cosine similarities
    similarities = cosine_similarity(user_vectors, policy_vector).flatten()
    user_data['Relevance'] = similarities

    # Sort users by relevance
    sorted_users = user_data.sort_values(by='Relevance', ascending=False)
    return sorted_users


def save_results(target_audience, output_file):
    """Saves the target audience to a CSV file."""
    target_audience.to_csv(output_file, index=False)
    print(f"Target audience saved to {output_file}")


def main():
    user_data_csv = '/content/train.csv'
    policy_pdf = '/content/home-loan-policy.pdf'
    output_file = '/content/target_audience_home_loan.csv'

    try:
        # Load user data from CSV
        user_data = load_user_data_from_csv(user_data_csv)
        print("User data loaded successfully!")

        # Filter users who do not have a housing loan
        user_data_no_housing_loan = user_data[user_data['housing'] == 'no']
        print("Filtered users without a housing loan.")

        # Extract policy text from PDF
        policy_text = extract_text_from_pdf(policy_pdf)
        policy_text = clean_text(policy_text)
        print("Policy text extracted and cleaned successfully!")

        # Find target audience using 'housing' column
        target_audience = find_target_audience(user_data_no_housing_loan, policy_text, user_text_column='housing')
        print("Target audience identified successfully!")

        # Save results
        save_results(target_audience, output_file)
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


User data loaded successfully!
Filtered users without a housing loan.
Error: [Errno 2] No such file or directory: '/content/home-loan-policy.pdf'
