## **Data Cleaning**

In [3]:
import pandas as pd

# Load the raw CSV file.
# low_memory=False tells pandas to infer each column's dtype from the whole
# file in one pass rather than chunk by chunk. Chunked inference is what
# produces pandas' mixed-type DtypeWarning (the warning recorded for this
# read_csv call looks like that one) and can leave a single column holding
# both strings and numbers, which makes duplicate detection and the written
# CSV inconsistent.
df = pd.read_csv('/content/labeled_financial_news_2M.csv', low_memory=False)

# Drop every row that has at least one missing value, then drop rows that are
# exact duplicates of an earlier row. `df` itself is left untouched.
df_clean = df.dropna().drop_duplicates()

# Save the cleaned DataFrame to a new CSV file (without the index column)
clean_csv_path = '/content/labeled_financial_news_2M_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)

print("Cleaned CSV file saved at:", clean_csv_path)

  df = pd.read_csv('/content/labeled_financial_news_2M.csv')


Cleaned CSV file saved at: /content/labeled_financial_news_2M_clean.csv


**Now test the cleaned CSV to make sure it has no missing values or duplicate rows**

In [2]:
import pandas as pd

def check_csv_cleanliness(file_path):
    """Print a diagnostic report for the CSV file at ``file_path``.

    The report contains, in order: a load confirmation, the first five rows,
    ``DataFrame.info()``, summary statistics for all columns, the number of
    missing values per column and in total, the number of duplicate rows, and
    a final verdict.

    If the file cannot be read, the error is printed and the function returns
    without producing the rest of the report.

    Parameters
    ----------
    file_path : path passed straight to ``pd.read_csv``.

    Returns
    -------
    None. All results are written to standard output.
    """
    try:
        frame = pd.read_csv(file_path)
        print("CSV loaded successfully!")
    except Exception as err:
        # Report the problem and stop; nothing else can be checked.
        print("Error loading CSV:", err)
        return

    # Preview of the data
    print("\nFirst 5 rows of the DataFrame:")
    print(frame.head())

    # Column dtypes and non-null counts (info() prints by itself)
    print("\nDataFrame Info:")
    frame.info()

    # Statistics for every column, numeric or not
    print("\nSummary Statistics:")
    print(frame.describe(include='all'))

    # Missing values: per column, then overall
    nulls_per_column = frame.isna().sum()
    null_total = nulls_per_column.sum()
    print("\nMissing Values per Column:")
    print(nulls_per_column)
    print("Total missing values in the DataFrame:", null_total)

    # Rows that repeat an earlier row exactly
    repeated_rows = frame.duplicated().sum()
    print("\nNumber of duplicate rows:", repeated_rows)

    # Verdict: "clean" means no missing values and no duplicate rows.
    if null_total == 0 and repeated_rows == 0:
        print("\nOverall: The CSV file appears to be clean!")
        return

    print("\nOverall: The CSV file may require cleaning:")
    findings = (
        (null_total, "- There are missing values that may need to be handled."),
        (repeated_rows, "- There are duplicate rows that may need to be removed."),
    )
    for count, message in findings:
        if count > 0:
            print(message)

# Location of the cleaned CSV that should be inspected
csv_path = '/content/labeled_financial_news_2M_clean.csv'

# Print the cleanliness report for that file
check_csv_cleanliness(csv_path)


CSV loaded successfully!

First 5 rows of the DataFrame:
                      date    year  \
0  2020-06-05 06:30:54 UTC  2020.0   
1  2020-06-03 06:45:20 UTC  2020.0   
2  2020-05-26 00:30:07 UTC  2020.0   
3  2020-05-22 08:45:06 UTC  2020.0   
4  2020-05-22 07:38:59 UTC  2020.0   

                                               title sentiment  \
0            Stocks That Hit 52-Week Highs On Friday   neutral   
1         Stocks That Hit 52-Week Highs On Wednesday   neutral   
2                      71 Biggest Movers From Friday   neutral   
3       46 Stocks Moving In Friday's Mid-Day Session   neutral   
4  B of A Securities Maintains Neutral on Agilent...   bullish   

           publisher stock_symbol  \
0  Benzinga Insights            A   
1  Benzinga Insights            A   
2         Lisa Levin            A   
3         Lisa Levin            A   
4         Vick Meyer            A   

                                                 url  
0  https://www.benzinga.com/news/20/06/

# **List all the stock tickers in the dataset**

In [3]:
import pandas as pd

# Load the cleaned dataset (adjust the path if the file lives elsewhere)
df = pd.read_csv('/content/labeled_financial_news_2M_clean.csv')

# Distinct values of the stock_symbol column
unique_symbols = df['stock_symbol'].unique()

# Report how many distinct symbols there are, then list them one per line
print(f"Number of unique symbols: {len(unique_symbols)}")
for sym in unique_symbols:
    print(sym)



Number of unique symbols: 6204
A
AA
AAC
AADR
AAL
AAMC
AAME
AAN
AAOI
AAON
AAP
AAPL
AAU
AAV
AAVL
AAWW
AAXJ
AB
ABAC
ABAX
ABB
ABBV
ABC
ABCB
ABCD
ABCO
ABCW
ABDC
ABEV
ABG
ABGB
ABIO
ABM
ABMD
ABR
ABTL
ABX
ABY
ACAD
ACAS
ACAT
ACC
ACCO
ACCU
ACE
ACET
ACFC
ACFN
ACG
ACGL
ACH
ACHC
ACHN
ACIW
ACLS
ACM
ACMP
ACN
ACNB
ACOR
ACP
ACPW
ACRE
ACRX
ACSF
ACST
ACT
ACTA
ACTG
ACTS
ACU
ACUR
ACWI
ACWV
ACWX
ACXM
ACY
ADAT
ADBE
ADC
ADEP
ADES
ADGE
ADHD
ADI
ADK
ADM
ADMA
ADMP
ADMS
ADNC
ADP
ADPT
ADRA
ADRD
ADRE
ADRU
ADS
ADSK
ADT
ADTN
ADVS
ADX
ADXS
ADZ
AE
AEB
AEC
AEE
AEG
AEGN
AEGR
AEHR
AEIS
AEL
AEM
AEO
AEPI
AER
AERI
AES
AET
AETI
AEY
AEZS
AF
AFA
AFAM
AFB
AFC
AFCB
AFFX
AFG
AFH
AFK
AFL
AFMD
AFOP
AFSI
AFT
AG
AGA
AGC
AGCO
AGD
AGEN
AGF
AGG
AGII
AGIO
AGM
AGN
AGNC
AGNCB
AGND
AGO
AGOL
AGQ
AGRO
AGRX
AGTC
AGU
AGX
AGYS
AHC
AHGP
AHH
AHL
AHP
AHPI
AHT
AI
AIA
AIB
AIF
AIG
AIMC
AIN
AINC
AINV
AIQ
AIR
AIRM
AIRT
AIT
AIV
AIXG
AIZ
AJG
AKAM
AKAO
AKBA
AKER
AKG
AKP
AKR
AKRX
AKS
AL
ALB
ALCO
ALD
ALDR
ALDW
ALDX
ALE
ALEX
ALFA
ALG
ALGN
ALGT
ALIM
ALJ
ALK
ALK