In [1]:
import pandas as pd

# Both input files are tab-separated; parse each one into its own dataframe.
notes = pd.read_csv("data/notes-00000.tsv", delimiter="\t")
ratings = pd.read_csv("data/ratings-00000.tsv", delimiter="\t")

# Preview the first five rows of each frame under its own heading.
# sep="\n" puts the heading and the table on separate lines, exactly as two
# consecutive print calls would.
print("Notes dataframe:", notes.head(), sep="\n")
print("\nRatings dataframe:", ratings.head(), sep="\n")




Notes dataframe:
                noteId                            noteAuthorParticipantId  \
0  1537142913737428992  E786F29F77E52D959A2604701A9A4F491B0BA83687E515...   
1  1537145358521839617  E786F29F77E52D959A2604701A9A4F491B0BA83687E515...   
2  1537147343715282945  E786F29F77E52D959A2604701A9A4F491B0BA83687E515...   
3  1537204430730211328  E786F29F77E52D959A2604701A9A4F491B0BA83687E515...   
4  1540422295029551104  E786F29F77E52D959A2604701A9A4F491B0BA83687E515...   

   createdAtMillis              tweetId  \
0    1655318404027  1377030478167937024   
1    1655318986910  1536848327979016193   
2    1655319460217  1537080831751102467   
3    1655333070821  1537196168953974784   
4    1656100269455  1540087463099736065   

                          classification          believable  \
0  MISINFORMED_OR_POTENTIALLY_MISLEADING  BELIEVABLE_BY_MANY   
1                         NOT_MISLEADING                 NaN   
2  MISINFORMED_OR_POTENTIALLY_MISLEADING  BELIEVABLE_BY_MANY   
3  MI

In [2]:
# Numeric summary (count, mean, std, min, quartiles, max) for each frame.
# NOTE(review): identifier columns such as noteId / tweetId are summarised as
# ordinary numbers here, so their mean/std carry no real meaning.
print("Notes summary statistics:", notes.describe(), sep="\n")
print("\nRatings summary statistics:", ratings.describe(), sep="\n")



Notes summary statistics:
             noteId  createdAtMillis       tweetId  misleadingOther  \
count  4.792100e+04     4.792100e+04  4.792100e+04     47921.000000   
mean   1.500276e+18     1.646529e+12  1.496492e+18         0.055070   
std    8.555401e+16     2.039767e+10  9.411081e+16         0.228119   
min    1.352797e+18     1.611367e+12  1.305232e+10         0.000000   
25%    1.430334e+18     1.629853e+12  1.428068e+18         0.000000   
50%    1.506713e+18     1.648063e+12  1.503337e+18         0.000000   
75%    1.588651e+18     1.667599e+12  1.587887e+18         0.000000   
max    1.620580e+18     1.675211e+12  1.620550e+18         1.000000   

       misleadingFactualError  misleadingManipulatedMedia  \
count            47921.000000                47921.000000   
mean                 0.504977                    0.048663   
std                  0.499980                    0.215166   
min                  0.000000                    0.000000   
25%                  0.000000

In [3]:
# Per-column count of null entries, reported separately for each frame.
print("Missing values in notes:", notes.isnull().sum(), sep="\n")
print("\nMissing values in ratings:", ratings.isnull().sum(), sep="\n")



Missing values in notes:
noteId                                        0
noteAuthorParticipantId                       0
createdAtMillis                               0
tweetId                                       0
classification                                0
believable                                17905
harmful                                   17905
validationDifficulty                      17905
misleadingOther                               0
misleadingFactualError                        0
misleadingManipulatedMedia                    0
misleadingOutdatedInformation                 0
misleadingMissingImportantContext             0
misleadingUnverifiedClaimAsFact               0
misleadingSatire                              0
notMisleadingOther                            0
notMisleadingFactuallyCorrect                 0
notMisleadingOutdatedButNotWhenWritten        0
notMisleadingClearlySatire                    0
notMisleadingPersonalOpinion                  0
trustworthySour

In [4]:
def _print_value_counts(frame):
    """Print the value counts of every column in ``frame``.

    For each column this prints the column label followed by a colon, then the
    result of ``value_counts()`` for that column, then a blank separator.

    Parameters
    ----------
    frame : pandas.DataFrame
        The frame whose columns are tallied one by one.
    """
    for column in frame.columns:
        print(f"{column}:")
        print(frame[column].value_counts())
        # print("\n") emits two line breaks, leaving a visible gap between columns.
        print("\n")


# Check the distribution of values in each column.
# NOTE(review): this also tallies identifier and free-text columns (e.g. noteId,
# summary), where nearly every value is unique; consider restricting the
# report to low-cardinality columns.
print("Value counts for notes:")
_print_value_counts(notes)

print("Value counts for ratings:")
_print_value_counts(ratings)

Value counts for notes:
noteId:
1537142913737428992    1
1358974469977628676    1
1358214251664572416    1
1358215970725191687    1
1358483601898225672    1
                      ..
1430978974935568391    1
1431376314351689729    1
1434350047764926471    1
1437237456949104644    1
1619367452684521473    1
Name: noteId, Length: 47921, dtype: int64


noteAuthorParticipantId:
114D3959B7A1FAF013A9773725D3AC9653F490590C8B76CB7CE232B010DE900E    1841
FF81A02BFA08DB8505E055E3EDA8ED4D716F4242CE334472ADC3EE9BE15D8553     831
ADD6D84E1F57E6163E655F2B417E4696E606ADB54DD1EDE679CED0D4F3222ABD     696
46D893DFC474111C12B5EF54ADED5291021E07B5BC4D944EB4F35D18A93785C7     541
464F514091F013106E190C712492FDDD7A2F35D38B0E762F31374CC21B9FD858     457
                                                                    ... 
6EDE44B43B5ACB9C8D81520C37C6CEACCD5EEC41EF3100238D6AB45D16B70041       1
38C39A4F7C28152B75B721C491E29C5A120F3F7CA40FDC44C72D490D8AEF7A34       1
F820DFACA36848A01D976E2D590170B4C08753C8

1670688968125    2
1674445278401    2
1671036027119    2
1671162852869    2
1671209339332    2
                ..
1667177143982    1
1667177132134    1
1667177201984    1
1667177177939    1
1675176914800    1
Name: createdAtMillis, Length: 1138637, dtype: int64


version:
2    1101613
1      37086
Name: version, dtype: int64


agree:
0    1114619
1      24080
Name: agree, dtype: int64


disagree:
0    1129331
1       9368
Name: disagree, dtype: int64


helpful:
0    1114847
1      23852
Name: helpful, dtype: int64


notHelpful:
0    1126088
1      12611
Name: notHelpful, dtype: int64


helpfulnessLevel:
HELPFUL             567794
NOT_HELPFUL         424323
SOMEWHAT_HELPFUL    109496
Name: helpfulnessLevel, dtype: int64


helpfulOther:
0    1113459
1      25240
Name: helpfulOther, dtype: int64


helpfulInformative:
0    1123416
1      15283
Name: helpfulInformative, dtype: int64


helpfulClear:
0    680494
1    458205
Name: helpfulClear, dtype: int64


helpfulEmpathetic:
0    1131665
1 

In [5]:
# List the column labels currently present in the notes frame.
notes.columns

Index(['noteId', 'noteAuthorParticipantId', 'createdAtMillis', 'tweetId',
       'classification', 'believable', 'harmful', 'validationDifficulty',
       'misleadingOther', 'misleadingFactualError',
       'misleadingManipulatedMedia', 'misleadingOutdatedInformation',
       'misleadingMissingImportantContext', 'misleadingUnverifiedClaimAsFact',
       'misleadingSatire', 'notMisleadingOther',
       'notMisleadingFactuallyCorrect',
       'notMisleadingOutdatedButNotWhenWritten', 'notMisleadingClearlySatire',
       'notMisleadingPersonalOpinion', 'trustworthySources', 'summary'],
      dtype='object')

In [8]:
# Per-reason indicator columns (misleading* / notMisleading*) to remove from
# the notes frame.
columns_to_drop = ['misleadingOther', 'misleadingFactualError', 'misleadingManipulatedMedia',
                   'misleadingOutdatedInformation', 'misleadingMissingImportantContext',
                   'misleadingUnverifiedClaimAsFact', 'misleadingSatire', 'notMisleadingOther',
                   'notMisleadingFactuallyCorrect', 'notMisleadingOutdatedButNotWhenWritten',
                   'notMisleadingClearlySatire', 'notMisleadingPersonalOpinion']
# `notes` is rebound to the trimmed frame, so re-running this cell would hit
# columns that are already gone. errors="ignore" makes the cell re-runnable
# instead of raising KeyError on the second execution. Trade-off: a misspelled
# label in the list is skipped silently rather than reported.
notes = notes.drop(columns=columns_to_drop, errors="ignore")

In [11]:
# Remove the author identifier and creation timestamp from the notes frame.
# `notes` is rebound in place, so errors="ignore" keeps the cell re-runnable:
# a second execution finds the columns already gone and would otherwise raise
# KeyError. Trade-off: a misspelled label is skipped silently.
notes = notes.drop(columns=['noteAuthorParticipantId', 'createdAtMillis'], errors="ignore")

In [13]:
# Show which column labels remain in the notes frame at this point.
notes.columns

Index(['noteId', 'tweetId', 'classification', 'believable', 'harmful',
       'validationDifficulty', 'trustworthySources', 'summary'],
      dtype='object')

In [14]:
# Display the notes frame; the notebook renders a truncated view of its rows.
notes

Unnamed: 0,noteId,tweetId,classification,believable,harmful,validationDifficulty,trustworthySources,summary
0,1537142913737428992,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,1,Forbes has a good rundown of the investigation...
1,1537145358521839617,1536848327979016193,NOT_MISLEADING,,,,0,They are expressing a personal opinion in a st...
2,1537147343715282945,1537080831751102467,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,LITTLE_HARM,EASY,1,Teslas purchased after 12/31/19 are not eligib...
3,1537204430730211328,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,1,The Jan 6th riots were encouraged by the sitti...
4,1540422295029551104,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,1,The Committee has been found by numerous court...
...,...,...,...,...,...,...,...,...
47916,1598885729853218817,1598861004611338240,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,1,While the Trump administration repeatedly deni...
47917,1617728296350785536,1617518566953349120,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,0,"As the thread describes, teachers are being to..."
47918,1616864184435675136,1616454061666230274,NOT_MISLEADING,,,,0,This tweet is clearly expressing a personal op...
47919,1617008572591734784,1614309627465719809,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,1,The boy involved in this incident was not &quo...


In [15]:
# Number of notes carrying each classification label, heading on its own line.
print("Classification value counts:", notes['classification'].value_counts(), sep="\n")

Classification value counts:
MISINFORMED_OR_POTENTIALLY_MISLEADING    40515
NOT_MISLEADING                            7406
Name: classification, dtype: int64


In [16]:
# Tally of each 'believable' label; the heading's leading "\n" leaves a blank
# line before it. Rows with no value in this column are not counted.
print("\nBelievable value counts:", notes['believable'].value_counts(), sep="\n")


Believable value counts:
BELIEVABLE_BY_MANY    27776
BELIEVABLE_BY_FEW      2240
Name: believable, dtype: int64


In [17]:
# Tally of each 'harmful' label; the heading's leading "\n" leaves a blank
# line before it. Rows with no value in this column are not counted.
print("\nHarmful value counts:", notes['harmful'].value_counts(), sep="\n")


Harmful value counts:
CONSIDERABLE_HARM    22202
LITTLE_HARM           7814
Name: harmful, dtype: int64


In [18]:
# Tally of each 'validationDifficulty' label; the heading's leading "\n" leaves
# a blank line before it. Rows with no value in this column are not counted.
print("\nValidation difficulty value counts:", notes['validationDifficulty'].value_counts(), sep="\n")


Validation difficulty value counts:
EASY           20429
CHALLENGING     9587
Name: validationDifficulty, dtype: int64


In [19]:
# Tally of the 0/1 'trustworthySources' flag; the heading's leading "\n" leaves
# a blank line before it.
print("\nTrustworthy sources value counts:", notes['trustworthySources'].value_counts(), sep="\n")


Trustworthy sources value counts:
1    37877
0    10044
Name: trustworthySources, dtype: int64
