In [28]:
import pandas as pd
import numpy as np

twitter_training_file = "Data/twitter_training.csv"
twitter_validation_file = "Data/twitter_validation.csv"

df = pd.read_csv(twitter_training_file)
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [8]:
print("Shape:", df.shape)
print("---------------------------------------")
print("Columns:", df.columns)
print("---------------------------------------")
print("Datatypes:", df.dtypes)

Shape: (74681, 4)
---------------------------------------
Columns: Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')
---------------------------------------
Datatypes: 2401                                                      int64
Borderlands                                              object
Positive                                                 object
im getting on borderlands and i will murder you all ,    object
dtype: object


In [9]:
# column names are little weird.....let me do some initial exploration
df["Borderlands"].unique()

array(['Borderlands', 'CallOfDutyBlackopsColdWar', 'Amazon', 'Overwatch',
       'Xbox(Xseries)', 'NBA2K', 'Dota2', 'PlayStation5(PS5)',
       'WorldOfCraft', 'CS-GO', 'Google', 'AssassinsCreed', 'ApexLegends',
       'LeagueOfLegends', 'Fortnite', 'Microsoft', 'Hearthstone',
       'Battlefield', 'PlayerUnknownsBattlegrounds(PUBG)', 'Verizon',
       'HomeDepot', 'FIFA', 'RedDeadRedemption(RDR)', 'CallOfDuty',
       'TomClancysRainbowSix', 'Facebook', 'GrandTheftAuto(GTA)',
       'MaddenNFL', 'johnson&johnson', 'Cyberpunk2077',
       'TomClancysGhostRecon', 'Nvidia'], dtype=object)

In [10]:
df["2401"].unique()

array([2401, 2402, 2403, ..., 9198, 9199, 9200])

In [11]:
df["Positive"].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [15]:
df["im getting on borderlands and i will murder you all ,"].unique()

array(['I am coming to the borders and I will kill you all,',
       'im getting on borderlands and i will kill you all,',
       'im coming on borderlands and i will murder you all,', ...,
       'Just realized the windows partition of my Mac is now 6 years behind on Nvidia drivers and I have no idea how he didn’t notice',
       'Just realized between the windows partition of my Mac is like being 6 years behind on Nvidia drivers and cars I have no fucking idea how I ever didn ’ t notice',
       'Just like the windows partition of my Mac is like 6 years behind on its drivers So you have no idea how I didn’t notice'],
      dtype=object)

### Conclusion 1 : Change the column names + make the current column name the first row of data
#### (HAHAHA ok, kaggle had the names on the bottom, but its accurate to what i have)

- Column 2401 --> Tweet ID
- Column Borderlands --> Topic
- Column Positive --> Sentiment
- Column "..." --> Tweet/Text

In [29]:
###### TRANSFORM DATA 
column_but_row_value = df.columns 

# rename columns using .rename(columns = {new dictionary mapping})
df = df.rename(columns = {"2401" : "Tweet ID",
                  "Borderlands" : "Topic",
                  "Positive" : "Sentiment",
                  "im getting on borderlands and i will murder you all ," : "Tweet"}) 
df.head()

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [36]:
df.shape

(74681, 4)

In [30]:
print(column_but_row_value)

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')


In [31]:
column_but_row_value[0]

'2401'

In [32]:
# creating df -> pd.DataFrame(dictionary whose value maps to a list)
df2 = pd.DataFrame({
    "Tweet ID" : [column_but_row_value[0]],
    "Topic" : [column_but_row_value[1]],
    "Sentiment" : [column_but_row_value[2]],
    "Tweet" : [column_but_row_value[3]]
})

df2.head()

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...


In [37]:
df2.shape

(1, 4)

In [34]:
# merge the two df to create the clean column dataset
# concat df and df2, axis = 0 = stacks df row-wise, ignore_index = True = ignore the index, we're not stacking/combining based on index value
combined_df = pd.concat([df, df2], axis = 0, ignore_index = True)
combined_df.head()

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [78]:
combined_df["Tweet ID"].nunique()

12448

In [79]:
# CHECKING to see that the column combined contains the original column values
combined_df[combined_df["Tweet ID"] == "2401"]

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
74681,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...


In [38]:
# combined_df should have the shape of: (74682, 4)
combined_df.shape

(74682, 4)

In [53]:
# export this dataframe
combined_df.to_csv("Data/twitter_training (column fixed).csv", index = False) # index = False = prevents index to be written into file

### Does twitter_validation.csv have the same column naming issue?

Edit: YES

In [40]:
df_validation = pd.read_csv(twitter_validation_file)
df_validation.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [41]:
df_validation.shape

(999, 4)

In [42]:
df_validation["Facebook"].unique()

array(['Amazon', 'Microsoft', 'CS-GO', 'Google', 'FIFA', 'MaddenNFL',
       'TomClancysRainbowSix', 'AssassinsCreed', 'CallOfDuty', 'Dota2',
       'Verizon', 'NBA2K', 'Nvidia', 'GrandTheftAuto(GTA)',
       'RedDeadRedemption(RDR)', 'Hearthstone', 'ApexLegends',
       'Overwatch', 'PlayerUnknownsBattlegrounds(PUBG)', 'Borderlands',
       'PlayStation5(PS5)', 'johnson&johnson', 'Fortnite',
       'Xbox(Xseries)', 'CallOfDutyBlackopsColdWar', 'Facebook',
       'HomeDepot', 'Cyberpunk2077', 'TomClancysGhostRecon',
       'WorldOfCraft', 'LeagueOfLegends', 'Battlefield'], dtype=object)

In [44]:
df_validation.columns

Index(['3364', 'Facebook', 'Irrelevant',
       'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'],
      dtype='object')

In [45]:
validation_col_transform_row = df_validation.columns
df_validation = df_validation.rename(columns = {
    "3364" : "Tweet ID",
    "Facebook" : "Topic",
    "Irrelevant" : "Sentiment",
    "I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣" : "Tweet"
})

df_validation.head()

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [47]:
df_validation2 = pd.DataFrame({
    "Tweet ID" : [validation_col_transform_row[0]],
    "Topic" : [validation_col_transform_row[1]],
    "Sentiment" : [validation_col_transform_row[2]],
    "Tweet" : [validation_col_transform_row[3]]
})

df_validation2.head()

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...


In [48]:
df_validation_combined = pd.concat([df_validation, df_validation2], axis = 0, ignore_index = True)

In [49]:
df_validation_combined.head()

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [50]:
df_validation_combined.shape

(1000, 4)

In [54]:
df_validation_combined.dtypes

Tweet ID     object
Topic        object
Sentiment    object
Tweet        object
dtype: object

In [56]:
df_validation_combined.shape[0]

1000

In [69]:
df_validation_combined["Tweet ID"].nunique() # every tweet id is unique

1000

In [75]:
# checking to see if the value exists in the final combined value
df_validation_combined["Tweet ID"] == ""


In [77]:
df_validation_combined[df_validation_combined["Tweet ID"] == "3364"]

Unnamed: 0,Tweet ID,Topic,Sentiment,Tweet
999,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...


In [52]:
df_validation_combined.to_csv("Data/twitter_validation (column fixed).csv", index = False)