<h1>Scraping data</h1>

Importing required libraraies

In [60]:
import pandas as pd
import requests
import re
import os

Reading the input file 

In [61]:
df = pd.read_csv('News_Articles_Indian_Express.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,article_id,headline,desc,date,url,articles,article_type,article_length
0,0,INDEXP20000,"Trainer aircraft crashes in Odisha, 2 killed",The two were taken to a nearby hospital in Kam...,"June 8, 2020 11:06:53 am",https://indianexpress.com/article/india/traine...,A two-seater aircraft crashed in Odisha's Dhen...,short,753
1,1,INDEXP19999,Uttarkhand unlock 1.0: Hotel bookings for mini...,All hotels/ B&B/ Homestay & hospitality servic...,"June 8, 2020 11:03:16 am",https://indianexpress.com/article/india/touris...,Hotels located in non-containment zones can re...,long,2424


In [62]:
df.columns

Index(['Unnamed: 0', 'article_id', 'headline', 'desc', 'date', 'url',
       'articles', 'article_type', 'article_length'],
      dtype='object')

Extracting Image URLs from the webpage URL

In [63]:

def extract_image_url_from_url(article_url):
    '''This function returns the Image URL'''
    try:
        # Fetch the HTML content of the article URL
        response = requests.get(article_url, timeout = 10)
        response.raise_for_status()  # Raise an exception for bad status codes
        html_content = response.text
        
        # Find all <img> tags in the HTML content with class containing 'wp-image-'
        img_tags = re.findall(r'<img.*?class=".*?wp-image-\d+.*?".*?src="(.*?)"', html_content)
        
        # Return the first image URL if found, otherwise return None
        return img_tags[0] if img_tags else None
    except Exception as e:
        print(f"Error occurred while extracting image URL from {article_url}: {e}")
        return None


In [64]:
def process_and_save_in_batches(df, batch_size):
    '''This function adds a new column to the existing csv in which we store the URL of the image. The task is performed in batches.'''
    if 'image_url' not in df.columns:
        df['image_url'] = None  
    num_batches = len(df) // batch_size + 1
    for batch_num in range(num_batches):
        updated_csv_filename = f'data\\updated_csv_batch_{batch_num}.csv'
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, len(df))
        
        # Check if the updated CSV file already exists
        if os.path.exists(updated_csv_filename):
            # Read the existing CSV file and update the 'image_url' column
            existing_df = pd.read_csv(updated_csv_filename)
            existing_df['image_url'] = existing_df.apply(lambda row: extract_image_url_from_url(row['url']) if pd.isna(row['image_url']) else row['image_url'], axis=1)
            existing_df.to_csv(updated_csv_filename, index=False)
            print(f"Batch {batch_num + 1} added to existing CSV file: {updated_csv_filename}")
        else:
            batch_df = df[start_index:end_index].copy()  # Create a copy of the batch DataFrame
            batch_df['image_url'] = batch_df.apply(lambda row: extract_image_url_from_url(row['url']) if pd.isna(row['image_url']) else row['image_url'], axis=1)
            # Save the updated DataFrame as a new CSV file
            batch_df.to_csv(updated_csv_filename, index=False)
            print(f"Batch {batch_num + 1} processed and saved to {updated_csv_filename}")


In [68]:
# Process and save DataFrame in batches of size 100
batch_size = 100
process_and_save_in_batches(df, batch_size)

Batch 1 added to existing CSV file: data\updated_csv_batch_0.csv
Batch 2 added to existing CSV file: data\updated_csv_batch_1.csv
Batch 3 added to existing CSV file: data\updated_csv_batch_2.csv
Batch 4 added to existing CSV file: data\updated_csv_batch_3.csv
Batch 5 added to existing CSV file: data\updated_csv_batch_4.csv
Batch 6 added to existing CSV file: data\updated_csv_batch_5.csv
Batch 7 added to existing CSV file: data\updated_csv_batch_6.csv
Batch 8 added to existing CSV file: data\updated_csv_batch_7.csv
Batch 9 added to existing CSV file: data\updated_csv_batch_8.csv
Batch 10 added to existing CSV file: data\updated_csv_batch_9.csv
Batch 11 added to existing CSV file: data\updated_csv_batch_10.csv
Batch 12 added to existing CSV file: data\updated_csv_batch_11.csv
Batch 13 added to existing CSV file: data\updated_csv_batch_12.csv
Batch 14 added to existing CSV file: data\updated_csv_batch_13.csv
Batch 15 added to existing CSV file: data\updated_csv_batch_14.csv
Batch 16 added

Saving Images to a new folder

In [17]:
def save_images_from_csvs(csv_folder):
    # Create the 'images' folder if it doesn't exist
    os.makedirs('images', exist_ok=True)
    
    # Iterate through each CSV file in the specified folder
    for filename in os.listdir(csv_folder):
        if filename.startswith('updated_csv_batch_') and filename.endswith('.csv'):
            filepath = os.path.join(csv_folder, filename)
            print(filepath)
            df = pd.read_csv(filepath)
            for index, row in df.iterrows():
                image_url = row['image_url']
                article_id = row['article_id']
                if pd.notnull(image_url):
                    image_filename = os.path.join('images', f"{article_id}.jpg")
                    # Check if image with the same name already exists in the 'images' folder
                    if not os.path.exists(image_filename):
                        try:
                            image_data = requests.get(image_url).content
                            with open(image_filename, 'wb') as f:
                                f.write(image_data)
                            print(f"Image saved: {image_filename}")
                        except Exception as e:
                            print(f"Error occurred while downloading image for article {article_id}: {e}")
                    # else:
                    #     print(f"Image for article {article_id} already exists, skipping download")




In [19]:
# Call the function with the folder containing the CSV files
csv_folder = 'data2'  # Update this with the path to your CSV folder
save_images_from_csvs(csv_folder)

data2\updated_csv_batch_0.csv
data2\updated_csv_batch_1.csv
data2\updated_csv_batch_10.csv
data2\updated_csv_batch_100.csv
data2\updated_csv_batch_101.csv
data2\updated_csv_batch_102.csv
data2\updated_csv_batch_103.csv
data2\updated_csv_batch_104.csv
data2\updated_csv_batch_105.csv
data2\updated_csv_batch_106.csv
data2\updated_csv_batch_107.csv
data2\updated_csv_batch_108.csv
data2\updated_csv_batch_109.csv
data2\updated_csv_batch_11.csv
data2\updated_csv_batch_110.csv
data2\updated_csv_batch_111.csv
data2\updated_csv_batch_112.csv
data2\updated_csv_batch_113.csv
data2\updated_csv_batch_114.csv
data2\updated_csv_batch_115.csv
data2\updated_csv_batch_116.csv
data2\updated_csv_batch_117.csv
data2\updated_csv_batch_118.csv
data2\updated_csv_batch_119.csv
data2\updated_csv_batch_12.csv
data2\updated_csv_batch_120.csv
data2\updated_csv_batch_121.csv
data2\updated_csv_batch_122.csv
data2\updated_csv_batch_123.csv
data2\updated_csv_batch_124.csv
data2\updated_csv_batch_125.csv
data2\updated_c

Counting the number of images left

In [57]:
def count_null_image_urls(csv_folder, count = 0):
    '''This function checks in how many csvs the image url is not added.'''
    
    for filename in os.listdir(csv_folder):
        if filename.startswith('updated_csv_batch_') and filename.endswith('.csv'):
            filepath = os.path.join(csv_folder, filename)
            df = pd.read_csv(filepath)
            null_count = df['image_url'].isnull().sum()
            # if null_count > 0 :
            #     print(f"Number of null values in {filename}: {null_count}")
            # else:
            count+=null_count
    print(f"Non zeros are {count}" )




In [77]:
# Call the function with the folder containing the CSV files
csv_folder = 'data'  # Update this with the path to your CSV folder
count_null_image_urls(csv_folder)

Non zeros are 6531


In [59]:
#  Call the function with the folder containing the CSV files
csv_folder = 'data2'  # Update this with the path to your CSV folder
count_null_image_urls(csv_folder)

Non zeros are 5838


Combing all the csv into a single one

In [None]:
folder_path = 'data'

# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

dfs = []

# Iterate through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

# Path to save the combined CSV file
combined_csv_path = 'modified.csv'

# Write the combined DataFrame to a CSV file
combined_df.to_csv(combined_csv_path, index=False)

print("Combined CSV file created successfully!")


In [None]:
def remove_columns_from_csv(file_path):
    '''This function removes columns from the csvs.'''
    df = pd.read_csv(file_path)
    
    # Drop specified columns
    columns_to_remove = ['date', 'articles', 'article_type', 'article_length']
    df = df.drop(columns_to_remove, axis=1)
    
    # Save the modified DataFrame back to CSV
    df.to_csv(file_path, index=False)

# Path to the folder containing CSV files
folder_path = 'data'

# Iterate over each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        remove_columns_from_csv(file_path)

Creating a new dataframe which consists of images left after hand-filtering. Iterating through all the csv and checking if any image with file name as the article id exist or not . If exist, append its value in he dataframe otherwise not.

In [78]:
# Path to the folder containing CSV files
data_folder = 'data'

# Path to the folder containing images
image_folder = 'original_image'

# Initialize a list to store the extracted data
data_list = []

# Path to the new CSV file
new_csv_filename = 'extracted_data.csv'

# Check if the new CSV file already exists
if os.path.exists(new_csv_filename):
    # Read the existing CSV file into a DataFrame
    existing_df = pd.read_csv(new_csv_filename)
else:
    # If the new CSV file does not exist, create an empty DataFrame
    existing_df = pd.DataFrame(columns=['article_id', 'url', 'image_url', 'headline', 'desc'])

# Iterate through the CSV files in the 'data' folder
for csv_filename in os.listdir('data'):
    if csv_filename.endswith('.csv'):
        # Read the CSV file
        csv_filepath = os.path.join(data_folder, csv_filename)
        df = pd.read_csv(csv_filepath)

        # Iterate through each row in the DataFrame
        for index, row in df.iterrows():
            article_id = row['article_id']

            # Check if the image file exists
            image_filename = f"{article_id}.jpg"
            image_filepath = os.path.join(image_folder, image_filename)
            if os.path.exists(image_filepath):
                # Check if the article_id already exists in the existing DataFrame
                if article_id in existing_df['article_id'].values:
                    continue  # Skip if article_id already exists in the DataFrame

                # Extract the values of 'url', 'image_url', 'headline', and 'desc'
                url = row['url']
                image_url = row['image_url']
                headline = row['headline']
                desc = row['desc']

                # Append the extracted data to the list
                data_list.append({'article_id': article_id, 'url': url, 'image_url': image_url, 'headline': headline, 'desc': desc})

# Create a new DataFrame from the extracted data list
new_df = pd.DataFrame(data_list)

# Concatenate the existing DataFrame and the new DataFrame
existing_df = pd.concat([existing_df, new_df], ignore_index=True)

# Save the concatenated DataFrame back to the CSV file
existing_df.to_csv(new_csv_filename, index=False)

print(f"Data extracted and saved to {new_csv_filename}")


Data extracted and saved to extracted_data.csv


Resized the images to a fixed size

In [None]:
from PIL import Image

def resize_images(input_dir, output_dir, target_size=(256, 256)):
    '''This function resizes all the images to a target size.'''
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if filename.endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif')):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            img = Image.open(input_path)
            img = img.resize(target_size, Image.ANTIALIAS)
            img.save(output_path)

# Example usage:
input_dir = 'trial'
output_dir = 'resized_trial'
resize_images(input_dir, output_dir)


  img = img.resize(target_size, Image.ANTIALIAS)


In [76]:
df_new = pd.read_csv('extracted_data.csv')
df_new.shape

(75, 5)

In [None]:
df_new.columns

Index(['article_id', 'url', 'image_url', 'headline', 'desc'], dtype='object')

<h1>Headline Manipulation : Fake Headline by inverting semantics</h1>

In [81]:
positive_to_negative = {

    'open':'close',
    'close':'open',
    'arrest':'release',
    'against': 'for',
    'good': 'bad',
    'great': 'disastrous',
    'positive': 'negative',
    'success': 'failure',
    'victory': 'defeat',
    'progress': 'regression',
    'win': 'loss',
    'heroic': 'villainous',
    'honor': 'dishonor',
    'uplifting': 'demoralizing',
    'hopeful': 'dismal',
    'inspiring': 'discouraging',
    'prosperous': 'ailing',
    'triumph': 'suffering',
    'benevolent': 'malevolent',
    'ethical': 'unethical',
    'righteous': 'corrupt',
    'virtuous': 'immoral',
    'beneficial': 'harmful',
    'savior': 'tyrant',
    'leader': 'dictator',
    'freedom': 'oppression',
    'liberty': 'tyranny',
    'justice': 'injustice',
    'fairness': 'unfairness',
    'equality': 'inequality',
    'peaceful': 'turbulent',
    'stable': 'unstable',
    'strong': 'weak',
    'powerful': 'powerless',
    'confident': 'doubtful',
    'optimistic': 'pessimistic',
    'bright': 'bleak',
    'radiant': 'gloomy',
    'hope': 'despair',
    'support': 'opposition',
    'unity': 'division',
    'collaboration': 'conflict',
    'harmony': 'discord',
    'cooperation': 'dissension',
    'agreement': 'disagreement',
    'consensus': 'disunity',
    'solidarity': 'disunity',
    'trust': 'distrust',
    'credibility': 'doubt',
    'reliable': 'unreliable',
    'credible': 'incredible',
    'transparent': 'opaque',
    'openness': 'secrecy',
    'accountability': 'unaccountability',
    'responsibility': 'irresponsibility',
    'honesty': 'dishonesty',
    'integrity': 'corruption',
    'ethical': 'unethical',
    'upright': 'crooked',
    'humble': 'arrogant',
    'modest': 'immodest',
    'selfless': 'selfish',
    'sacrifice': 'selfishness',
    'compassion': 'indifference',
    'empathy': 'apathy',
    'charitable': 'stingy',
    'philanthropic': 'greedy',
    'generous': 'selfish',
    'altruistic': 'self-serving',
    'noble': 'ignoble',
    'honorable': 'dishonorable',
    'prestigious': 'disreputable',
    'dignified': 'undignified',
    'respectable': 'disreputable',
    'esteemed': 'disreputable',
    'admired': 'disliked',
    'beloved': 'hated',
    'cherished': 'despised',
    'hero': 'villain',
    'icon': 'pariah',
    'idol': 'scoundrel',
    'legend': 'infamous',
    'role model': 'example of what not to do',
    'inspiration': 'source of dismay',
    'idolized': 'vilified',
    'worshiped': 'reviled',
    'glorified': 'condemned',
    'exemplary': 'reprehensible',
    'paragon': 'pariah',
    'pioneer': 'renegade',
    'visionary': 'heretic',
    'trailblazer': 'outcast',
    'maverick': 'rebel',
    'innovative': 'subversive',
    'revolutionary': 'counter-revolutionary',
    'groundbreaking': 'radical',
    'bold': 'reckless',
    'courageous': 'cowardly',
    'brave': 'fearful',
    'fearless': 'timid',
    'daring': 'cautious',
    'audacious': 'timid',
    'gallant': 'pusillanimous',
    'valiant': 'pusillanimous',
    'heroic': 'cowardly',
    'intrepid': 'fearful',
    'bold': 'faint-hearted',
    'venturesome': 'timid',
    'adventurous': 'unadventurous',
    'risk-taking': 'risk-averse',
    'admirable': 'pitiful',
    'laudable': 'pitiable',
    'praiseworthy': 'contemptible',
    'commendable': 'despicable',
    'noteworthy': 'deplorable',
    'applauded': 'berated',
    'applause': 'criticism',
    'disappointed':'proud',
    'acclaimed': 'condemned',
    'celebrated': 'maligned',
    'acknowledged': 'disregarded',
    'recognized': 'ignored',
    'eminent': 'obscure',
    'renowned': 'unknown',
    'famous': 'obscure',
    'illustrious': 'disreputable',
    'respected': 'disrespected',
    'honored': 'dishonored',
    'esteemed': 'disesteemed',
    'adored': 'reviled',
    'revered': 'vilified',
    'venerated': 'condemned',
    'glorified': 'abhorred',
    'lowly': 'highly',
    'poorly regarded': 'well-regarded',
    'loss':'gain', 
    'gain':'loss',
    'poorly': 'highly',
    'protest': 'celebrate',
    'dis':'',
    'for':'against',
    'praise':'criticise',
    'happy':'sad',
    'sad':'happy',
    'proud':'disappointed',
    'small':'big',
    'big':'small',
}



In [82]:
negative_to_positive = {


    'close':'open',
    'open':'close',
    'arrest':'release',
    'release':'arrest',
    'against': 'for',
    'bad': 'good',
    'disastrous': 'great',
    'negative': 'positive',
    'failure': 'success',
    'defeat': 'victory',
    'regression': 'progress',
    'loss': 'win',
    'villainous': 'heroic',
    'dishonor': 'honor',
    'demoralizing': 'uplifting',
    'dismal': 'hopeful',
    'discouraging': 'inspiring',
    'ailing': 'prosperous',
    'suffering': 'triumph',
    'malevolent': 'benevolent',
    'unethical': 'ethical',
    'corrupt': 'righteous',
    'immoral': 'virtuous',
    'harmful': 'beneficial',
    'tyrant': 'savior',
    'dictator': 'leader',
    'oppression': 'freedom',
    'tyranny': 'liberty',
    'injustice': 'justice',
    'unfairness': 'fairness',
    'inequality': 'equality',
    'turbulent': 'peaceful',
    'unstable': 'stable',
    'weak': 'strong',
    'powerless': 'powerful',
    'doubtful': 'confident',
    'pessimistic': 'optimistic',
    'bleak': 'bright',
    'gloomy': 'radiant',
    'despair': 'hope',
    'opposition': 'support',
    'division': 'unity',
    'conflict': 'collaboration',
    'discord': 'harmony',
    'dissension': 'cooperation',
    'disagreement': 'agreement',
    'disunity': 'consensus',
    'distrust': 'trust',
    'doubt': 'credibility',
    'unreliable': 'reliable',
    'incredible': 'credible',
    'opaque': 'transparent',
    'secrecy': 'openness',
    'unaccountability': 'accountability',
    'irresponsibility': 'responsibility',
    'dishonesty': 'honesty',
    'corruption': 'integrity',
    'unethical': 'ethical',
    'crooked': 'upright',
    'arrogant': 'humble',
    'immodest': 'modest',
    'selfishness': 'sacrifice',
    'selfish': 'selfless',
    'indifference': 'compassion',
    'apathy': 'empathy',
    'stingy': 'charitable',
    'greedy': 'philanthropic',
    'selfish': 'generous',
    'self-serving': 'altruistic',
    'ignoble': 'noble',
    'dishonorable': 'honorable',
    'disreputable': 'prestigious',
    'undignified': 'dignified',
    'dislike': 'admired',
    'hate': 'belove',
    'despised': 'cherished',
    'villain': 'hero',
    'pariah': 'icon',
    'scoundrel': 'idol',
    'infamous': 'legend',
    'example of what not to do': 'role model',
    'source of dismay': 'inspiration',
    'vilified': 'idolized',
    'reviled': 'worshiped',
    'condemned': 'glorified',
    'reprehensible': 'exemplary',
    'pariah': 'paragon',
    'renegade': 'pioneer',
    'heretic': 'visionary',
    'outcast': 'trailblazer',
    'rebel': 'maverick',
    'subversive': 'innovative',
    'counter-revolutionary': 'revolutionary',
    'radical': 'groundbreaking',
    'reckless': 'bold',
    'cowardly': 'courageous',
    'fearful': 'brave',
    'timid': 'fearless',
    'cautious': 'daring',
    'timid': 'audacious',
    'pusillanimous': 'gallant',
    'pusillanimous': 'valiant',
    'cowardly': 'heroic',
    'fearful': 'intrepid',
    'faint-hearted': 'bold',
    'timid': 'venturesome',
    'unadventurous': 'adventurous',
    'risk-averse': 'risk-taking',
    'pitiful': 'admirable',
    'pitiable': 'laudable',
    'contemptible': 'praiseworthy',
    'despicable': 'commendable',
    'deplorable': 'noteworthy',
    'berated': 'applauded',
    'criticism': 'applause',
    'proud': 'disappointed',
    'condemn': 'acclaim',
    'maligne': 'celebrate',
    'disregarde': 'acknowledge',
    'ignore': 'recognize',
    'obscure': 'eminent',
    'unknown': 'renowned',
    'obscure': 'famous',
    'disreputable': 'illustrious',
    'disrespected': 'respected',
    'dishonored': 'honored',
    'disesteemed': 'esteemed',
    'reviled': 'adored',
    'vilified': 'revered',
    'condemned': 'venerated',
    'abhorred': 'glorified',
    'highly': 'lowly',
    'well-regarded': 'poorly regarded',
    'highly': 'poorly',
    'celebrate': 'protest',
    'for': 'against',
    'criticise': 'praise',
    'sad': 'happy',
    'happy': 'sad',
    'disappointed': 'proud',
    'big': 'small',
    'small': 'big',
    'loss':'gain',
    'gain':'loss',
    'not ':' ', 
}


In [83]:
def invert_sentence(sentence, positive_to_negative):
    words = sentence.split()
    inverted_sentence = []
    for word in words:
        # Check if any part of the word is in the positive_to_negative dictionary
        found = False
        for pos_word, neg_word in positive_to_negative.items():
            if pos_word in word.lower():
                inverted_sentence.append(word.lower().replace(pos_word, neg_word))
                found = True
                break
        if not found:
            inverted_sentence.append(word)
    return ' '.join(inverted_sentence)



In [84]:
# Function to create fake headlines
def create_fake_headline(headline):
    fake_headline = invert_sentence(headline, positive_to_negative)
    return fake_headline

In [86]:
pip install flair

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1 -> 24.0
[notice] To update, run: C:\Users\Shubhi Agarwal\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [88]:
from flair.models import TextClassifier
from flair.data import Sentence

def get_sentiment(description):
    '''This function returns the sentiment of a statement.'''
    classifier = TextClassifier.load('en-sentiment')
    flair_sentence = Sentence(description)
    classifier.predict(flair_sentence)
    label = flair_sentence.labels[0].value
    return label


In [89]:
sentence = "I love this product! It's amazing."
sentiment= get_sentiment(sentence)
print("Sentiment:", sentiment)

In [None]:
# Output will be a modified sentence
orig_statement = 'People are protesting against the new law'
print("Orig:", orig_statement)
print(create_fake_headline(orig_statement)) 

Orig: People are protesting against the new law
People are celebrateing for the new law


In [109]:
# Load the CSV file
csv_filename = 'extracted_data.csv'
df_ed = pd.read_csv(csv_filename)

In [115]:
descriptions = df_ed['desc']

# Analyze sentiment and add a new column 'label'
df_ed['label'] = descriptions.apply(lambda x: get_sentiment(x))

# Save the modified DataFrame back to the CSV file
df_ed.to_csv('extracted_data.csv', index=False)

In [114]:
df_ed.columns

Index(['article_id', 'url', 'image_url', 'headline', 'desc', 'label',
       'modified_title'],
      dtype='object')

In [97]:
def modify_title(row):
    '''This function applies the invert_sentence based on the label'''
    sentiment = row['label']
    title = row['headline']
    if sentiment in ['POSITIVE', 'NEUTRAL']:
        return invert_sentence(title, positive_to_negative)
    else:
        return invert_sentence(title, negative_to_positive)

In [112]:
df_ed['modified_title'] = df_ed.apply(modify_title, axis=1)
# Save the modified DataFrame back to the CSV file
df_ed.to_csv('extracted_data.csv', index=False)

In [116]:
# Apply the function to create fake headlines and store in 'Fake_headline' column
df_ed['Fake_headline'] = df_ed['headline'].apply(create_fake_headline)

# Save the modified DataFrame to a new CSV file
csv_filename = 'extracted_data.csv'
df_ed.to_csv(new_csv_filename, index=False)

print(f"Fake headlines created and saved to {csv_filename}")


Fake headlines created and saved to extracted_data.csv


In [127]:
# Read the two CSV files
extracted_data = pd.read_csv('extracted_data.csv')
news_article = pd.read_csv('News_Articles_Indian_Express.csv')
extracted_data.columns

Index(['article_id', 'url_x', 'image_url_x', 'headline_x', 'desc_x', 'label_x',
       'modified_title', 'Fake_headline_x', 'Unnamed: 0', 'headline_y',
       'desc_y', 'date', 'url_y', 'articles', 'article_type', 'article_length',
       'label_y', 'image_url_y', 'headline', 'Fake_headline_y', 'desc', 'url'],
      dtype='object')

In [123]:
news_article.columns

Index(['Unnamed: 0', 'article_id', 'headline', 'desc', 'date', 'url',
       'articles', 'article_type', 'article_length'],
      dtype='object')

In [125]:
# Merge the two DataFrames on the 'article_id' column, for common columns
common_columns = pd.merge(extracted_data, news_article, on='article_id', how='left')

# Select specific columns from each DataFrame
specific_columns_left = extracted_data[['article_id', 'label', 'image_url', 'headline', 'Fake_headline']]
specific_columns_right = news_article[['article_id', 'desc', 'date', 'url', 'articles', 'article_length']]

# Merge the specific columns from each DataFrame with the common columns DataFrame
merged_df = pd.merge(common_columns, specific_columns_left, on='article_id', how='left')
merged_df[['desc', 'date', 'url', 'articles', 'article_length']] = specific_columns_right[['desc', 'date', 'url', 'articles', 'article_length']]

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('extracted_data.csv', index=False)


In [133]:
extracted_data.drop(columns = ['headline_x', 'headline_y','desc_x','url_x', 'url_y','Unnamed: 0', 'image_url_x', 'label_x' , 'Fake_headline_x', 'Fake_headline_y'], inplace = True)

In [134]:

extracted_data.rename(columns = {'image_url_y':'image_url','label_y':'label', 'desc_y':'description' }, inplace = True)

In [135]:
extracted_data.columns

Index(['article_id', 'modified_title', 'description', 'date', 'articles',
       'article_type', 'article_length', 'label', 'image_url', 'headline',
       'desc', 'url'],
      dtype='object')