## Loading Data

In [None]:
df = spark.read.csv('/FileStore/tables/klapeye_global_terrorism.csv',header=True,inferSchema=True)
df.show(5)

+----------+--------+------------------+----------------+--------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|      DATE| COUNTRY|             STATE|            CITY|     SUBREGION|REGION|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         COORDINATES| CATEGORY|
+----------+--------+------------------+----------------+--------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|2023-04-02|  Russia|              null|Saint Petersburg|Eastern Europe|Europe|       Darya Trepova|     42|   1|During a speech b...|64.6863136,97.745...|Explosion|
|2023-03-06|Pakistan|             Sindh|         Karachi| Southern Asia|  Asia|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|24.8546842,67.020...|Explosion|
|2023-02-17|Pakistan|             Sindh|         Karachi| Southern Asia|  Asia|Tehrik-i-Taliban ...|     16|   5|The 2023 Karachi ...|24.8546842,67.020...|  Assault|
|202

In [None]:
df.printSchema()

root
 |-- DATE: date (nullable = true)
 |-- COUNTRY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- SUBREGION: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- PERPETRATOR: string (nullable = true)
 |-- INJURED: integer (nullable = true)
 |-- DEAD: integer (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- COORDINATES: string (nullable = true)
 |-- CATEGORY: string (nullable = true)



In [None]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
df.dtypes

Out[4]: [('DATE', 'date'),
 ('COUNTRY', 'string'),
 ('STATE', 'string'),
 ('CITY', 'string'),
 ('SUBREGION', 'string'),
 ('REGION', 'string'),
 ('PERPETRATOR', 'string'),
 ('INJURED', 'int'),
 ('DEAD', 'int'),
 ('DESCRIPTION', 'string'),
 ('COORDINATES', 'string'),
 ('CATEGORY', 'string')]

In [None]:
df.count()

Out[5]: 27177

## Partitioning Dataframe

In [None]:
part_df = df.write.format('csv').option('header',True).partitionBy('COUNTRY').mode('overwrite').save('/datapartition/country')

##  loading Partitioned data

In [None]:
part_df = spark.read.csv('/datapartition/country',header=True,inferSchema=True)
part_df.show(5)

+----------+-------------------+-------+------------+------+--------------------+-------+----+--------------------+--------------------+---------+-------+
|      DATE|              STATE|   CITY|   SUBREGION|REGION|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         COORDINATES| CATEGORY|COUNTRY|
+----------+-------------------+-------+------------+------+--------------------+-------+----+--------------------+--------------------+---------+-------+
|2022-01-21| Diyala Governorate|   null|Western Asia|  Asia|Islamic State ins...|   null|  11|ISIL gunmen storm...|34.0228719,45.104...|  Assault|   Iraq|
|2021-11-07|Baghdad Governorate|Baghdad|Western Asia|  Asia|   Kata'ib Hezbollah|      6|   0|A failed attempt ...|33.3024309,44.378...|Explosion|   Iraq|
|2021-01-21|Baghdad Governorate|Baghdad|Western Asia|  Asia|       Islamic State|    110|  34|Two suicide bombe...|33.3024309,44.378...|Explosion|   Iraq|
|2018-04-12|               null|   null|Western Asia|  Asia|       Isl

## Loading data Country='India'

In [None]:
ind_df = spark.read.csv('/datapartition/country/COUNTRY=India',header=True,inferSchema=True)
ind_df.show(5)

+----------+-----------------+----------+-------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|      DATE|            STATE|      CITY|    SUBREGION|REGION|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         COORDINATES| CATEGORY|
+----------+-----------------+----------+-------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|2022-06-28|        Rajasthan|   Udaipur|Southern Asia|  Asia|           Islamists|      0|   1|A Hindu tailor wa...|24.61263988748612...|  Assault|
|2021-04-03|     Chhattisgarh|      null|Southern Asia|  Asia|            Naxalite|     32|  22|The 2021 Sukma-Bi...|21.6637359,81.840...|Explosion|
|2019-10-29|Jammu and Kashmir|    Kulgam|Southern Asia|  Asia|Hizbul Mujahideen...|      1|   7|Eight Bengali Mus...|33.66980055000000...|     null|
|2019-06-12|Jammu and Kashmir|      null|Southern Asia|  Asia|Hizbul Mujahideen...|      4|   5|Militants 

## Filtering

In [None]:
filter_df = part_df.filter((part_df['COUNTRY']=='India') & (part_df['INJURED']>0))
filter_df.show(5)

+----------+-----------------+-------+-------------+------+--------------------+-------+----+--------------------+--------------------+---------+-------+
|      DATE|            STATE|   CITY|    SUBREGION|REGION|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         COORDINATES| CATEGORY|COUNTRY|
+----------+-----------------+-------+-------------+------+--------------------+-------+----+--------------------+--------------------+---------+-------+
|2021-04-03|     Chhattisgarh|   null|Southern Asia|  Asia|            Naxalite|     32|  22|The 2021 Sukma-Bi...|21.6637359,81.840...|Explosion|  India|
|2019-10-29|Jammu and Kashmir| Kulgam|Southern Asia|  Asia|Hizbul Mujahideen...|      1|   7|Eight Bengali Mus...|33.66980055000000...|     null|  India|
|2019-06-12|Jammu and Kashmir|   null|Southern Asia|  Asia|Hizbul Mujahideen...|      4|   5|Militants killed ...|34.0747444,74.820...|  Assault|  India|
|2019-03-07|Jammu and Kashmir|   null|Southern Asia|  Asia|                n

## Creating Widgets

In [None]:
dbutils.widgets.combobox('Country','India',['India','Pakistan','Afghanistan','Iran','Israel','Turkey','Russia','United States'],'Choose Country')

In [None]:
coun = dbutils.widgets.get('Country')

In [None]:
rdf = df.filter(df['COUNTRY']==coun)
rdf.show()

+----------+-------+-----------------+----------+-------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|      DATE|COUNTRY|            STATE|      CITY|    SUBREGION|REGION|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         COORDINATES| CATEGORY|
+----------+-------+-----------------+----------+-------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|2022-06-28|  India|        Rajasthan|   Udaipur|Southern Asia|  Asia|           Islamists|      0|   1|A Hindu tailor wa...|24.61263988748612...|  Assault|
|2021-04-03|  India|     Chhattisgarh|      null|Southern Asia|  Asia|            Naxalite|     32|  22|The 2021 Sukma-Bi...|21.6637359,81.840...|Explosion|
|2019-10-29|  India|Jammu and Kashmir|    Kulgam|Southern Asia|  Asia|Hizbul Mujahideen...|      1|   7|Eight Bengali Mus...|33.66980055000000...|     null|
|2019-06-12|  India|Jammu and Kashmir|      null|Southern 

## Salting

In [None]:
salt_df = df.groupBy('REGION').agg({'INJURED':'sum'})
salt_df.show()

+--------+------------+
|  REGION|sum(INJURED)|
+--------+------------+
|  Europe|       18727|
|  Africa|       14472|
|    null|       18814|
|Americas|       12460|
| Oceania|         131|
|    Asia|      105506|
+--------+------------+



In [None]:
from pyspark.sql import functions as f
newdf = df.withColumn('Salt_key',f.concat(df['REGION'],f.lit('_'),f.monotonically_increasing_id()%10))
newdf.show(5)

+----------+--------+------------------+----------------+--------------+------+--------------------+-------+----+--------------------+--------------------+---------+--------+
|      DATE| COUNTRY|             STATE|            CITY|     SUBREGION|REGION|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         COORDINATES| CATEGORY|Salt_key|
+----------+--------+------------------+----------------+--------------+------+--------------------+-------+----+--------------------+--------------------+---------+--------+
|2023-04-02|  Russia|              null|Saint Petersburg|Eastern Europe|Europe|       Darya Trepova|     42|   1|During a speech b...|64.6863136,97.745...|Explosion|Europe_0|
|2023-03-06|Pakistan|             Sindh|         Karachi| Southern Asia|  Asia|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|24.8546842,67.020...|Explosion|  Asia_1|
|2023-02-17|Pakistan|             Sindh|         Karachi| Southern Asia|  Asia|Tehrik-i-Taliban ...|     16|   5|The 2023 Kar

In [None]:
newdf1 = newdf.groupBy(['REGION','Salt_key']).agg({'INJURED':'sum'})
newdf1.show()

+--------+----------+------------+
|  REGION|  Salt_key|sum(INJURED)|
+--------+----------+------------+
|  Africa|  Africa_2|         879|
| Oceania| Oceania_0|           2|
| Oceania| Oceania_4|          19|
| Oceania| Oceania_9|          50|
|    Asia|    Asia_6|       10223|
|Americas|Americas_2|         433|
|  Africa|  Africa_1|         834|
|  Africa|  Africa_3|         868|
|    Asia|    Asia_9|       12403|
|  Europe|  Europe_9|        1878|
|Americas|Americas_6|        1069|
|Americas|Americas_8|        2989|
|  Africa|  Africa_8|        1635|
|    Asia|    Asia_1|        9284|
|    null|      null|       18814|
|  Africa|  Africa_5|         569|
| Oceania| Oceania_2|          37|
| Oceania| Oceania_1|           0|
|  Europe|  Europe_0|        1614|
|  Africa|  Africa_0|        1017|
+--------+----------+------------+
only showing top 20 rows



## Final groupBy

In [None]:
final_newdf = newdf1.groupBy('REGION').agg({'sum(INJURED)':'sum'})
final_newdf.show()

+--------+-----------------+
|  REGION|sum(sum(INJURED))|
+--------+-----------------+
|  Europe|            18727|
|  Africa|            14472|
|    null|            18814|
|Americas|            12460|
| Oceania|              131|
|    Asia|           105506|
+--------+-----------------+



## Finding Null values

In [None]:
display(df.filter(df['CITY'].isNull()).count())

14026

In [None]:
null_counts_per_column = df.select([f.sum(f.col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts_per_column.show()

+----+-------+-----+-----+---------+------+-----------+-------+----+-----------+-----------+--------+
|DATE|COUNTRY|STATE| CITY|SUBREGION|REGION|PERPETRATOR|INJURED|DEAD|DESCRIPTION|COORDINATES|CATEGORY|
+----+-------+-----+-----+---------+------+-----------+-------+----+-----------+-----------+--------+
|   0|      0|11754|14026|     5995|  6000|      17024|      5|   1|          0|          7|    5590|
+----+-------+-----+-----+---------+------+-----------+-------+----+-----------+-----------+--------+



In [None]:
from pyspark.sql.functions import count,col
display(df.select([count(col(i)) for i in df.columns]))

count(DATE),count(COUNTRY),count(STATE),count(CITY),count(SUBREGION),count(REGION),count(PERPETRATOR),count(INJURED),count(DEAD),count(DESCRIPTION),count(COORDINATES),count(CATEGORY)
27177,27177,15423,13151,21182,21177,10153,27172,27176,27177,27170,21587


In [None]:
max_inj = df.groupBy('INJURED').count().agg(f.max('count')).collect()[0][0]
max_dead = df.groupBy('DEAD').count().agg(f.max('count')).collect()[0][0]
print(max_inj)
print(max_dead)


16843
16484


## Filling null values in INJURED,DEAD column

In [None]:
df = df.na.fill(value=0,subset=['INJURED','DEAD'])
df.show(5)


+----------+--------+------------------+----------------+--------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|      DATE| COUNTRY|             STATE|            CITY|     SUBREGION|REGION|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         COORDINATES| CATEGORY|
+----------+--------+------------------+----------------+--------------+------+--------------------+-------+----+--------------------+--------------------+---------+
|2023-04-02|  Russia|              null|Saint Petersburg|Eastern Europe|Europe|       Darya Trepova|     42|   1|During a speech b...|64.6863136,97.745...|Explosion|
|2023-03-06|Pakistan|             Sindh|         Karachi| Southern Asia|  Asia|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|24.8546842,67.020...|Explosion|
|2023-02-17|Pakistan|             Sindh|         Karachi| Southern Asia|  Asia|Tehrik-i-Taliban ...|     16|   5|The 2023 Karachi ...|24.8546842,67.020...|  Assault|
|202

## Dropping irrelevant columns

In [None]:
ndf = df.drop('REGION','SUBREGION','COORDINATES')
ndf.show(5)

+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+
|      DATE| COUNTRY|             STATE|            CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION| CATEGORY|
+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+
|2023-04-02|  Russia|              null|Saint Petersburg|       Darya Trepova|     42|   1|During a speech b...|Explosion|
|2023-03-06|Pakistan|             Sindh|         Karachi|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|Explosion|
|2023-02-17|Pakistan|             Sindh|         Karachi|Tehrik-i-Taliban ...|     16|   5|The 2023 Karachi ...|  Assault|
|2023-01-30|Pakistan|Khyber Pakhtunkhwa|        Peshawar|     Jamaat-ul-Ahrar|    220| 101|A suicide attacke...|Explosion|
|2023-01-27|  Israel|Jerusalem District|       Jerusalem|Palestinian Islam...|      3|   7|Seven people were...|  Assault|
+----------+----

## Creating new column 'Damage'

In [None]:
ndf = ndf.withColumn('DAMAGE',df['INJURED'] + df['DEAD'])
ndf.show(5)

+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|      DATE| COUNTRY|             STATE|            CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION| CATEGORY|DAMAGE|
+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|2023-04-02|  Russia|              null|Saint Petersburg|       Darya Trepova|     42|   1|During a speech b...|Explosion|    43|
|2023-03-06|Pakistan|             Sindh|         Karachi|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|Explosion|    22|
|2023-02-17|Pakistan|             Sindh|         Karachi|Tehrik-i-Taliban ...|     16|   5|The 2023 Karachi ...|  Assault|    21|
|2023-01-30|Pakistan|Khyber Pakhtunkhwa|        Peshawar|     Jamaat-ul-Ahrar|    220| 101|A suicide attacke...|Explosion|   321|
|2023-01-27|  Israel|Jerusalem District|       Jerusalem|Palestinian Islam...|      3|   7

## lets know the total damage,injured & dead in each country

In [None]:
from pyspark.sql import functions as f
n1df = ndf.groupBy('COUNTRY').agg(
    f.sum('INJURED').alias('sum_of_total_injured'),
    f.sum('DEAD').alias('sum_of_total_dead'),
    f.sum('DAMAGE').alias('sum_of_total_damage')
    )
n1df.show()

+--------------------+--------------------+-----------------+-------------------+
|             COUNTRY|sum_of_total_injured|sum_of_total_dead|sum_of_total_damage|
+--------------------+--------------------+-----------------+-------------------+
|              Russia|                5510|             2535|               8045|
|            Paraguay|                   1|                5|                  6|
|               Yemen|                1234|              660|               1894|
|              Sweden|                  23|               39|                 62|
|              Guyana|                   5|                4|                  9|
|         Philippines|                3125|             1181|               4306|
|            Malaysia|                  30|              131|                161|
|    Chechen Republic|                 132|              148|                280|
|              Turkey|                3831|              970|               4801|
|        Phillip

In [None]:
ff = n1df.orderBy(n1df['sum_of_total_damage'].desc())
ff = ff.select('COUNTRY','sum_of_total_damage')
display(ff.head(10))

COUNTRY,sum_of_total_damage
Iraq,65427
India,15524
Pakistan,15491
Afghanistan,15080
Israel,9957
United States,9314
Sri Lanka,8414
Russia,8045
Colombia,7277
Japan,6126


Databricks visualization. Run in Databricks to view.

## dataframe of city column with no null values

In [None]:
n2df = ndf.dropna(subset=['CITY'])
n2df.show(5)

+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|      DATE| COUNTRY|             STATE|            CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION| CATEGORY|DAMAGE|
+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|2023-04-02|  Russia|              null|Saint Petersburg|       Darya Trepova|     42|   1|During a speech b...|Explosion|    43|
|2023-03-06|Pakistan|             Sindh|         Karachi|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|Explosion|    22|
|2023-02-17|Pakistan|             Sindh|         Karachi|Tehrik-i-Taliban ...|     16|   5|The 2023 Karachi ...|  Assault|    21|
|2023-01-30|Pakistan|Khyber Pakhtunkhwa|        Peshawar|     Jamaat-ul-Ahrar|    220| 101|A suicide attacke...|Explosion|   321|
|2023-01-27|  Israel|Jerusalem District|       Jerusalem|Palestinian Islam...|      3|   7

## Finding total damage in every city of each country

In [None]:
ff1 = n2df.groupBy('COUNTRY','CITY').agg(
    f.sum('DAMAGE').alias('damage')
)
ff2 = ff1.orderBy(ff1['COUNTRY'],ff1['damage'].desc())
ff2.show()

+-----------+--------------+------+
|    COUNTRY|          CITY|damage|
+-----------+--------------+------+
|Afghanistan|        Kunduz|   211|
|Afghanistan|         Kabul|    60|
|Afghanistan|        Zaranj|    59|
|Afghanistan|        Gardez|    27|
|Afghanistan|      Charikar|    26|
|Afghanistan|       Maymana|     6|
|Afghanistan|      Asadabad|     6|
|Afghanistan|       Taloqan|     5|
|Afghanistan|       Gereshk|     2|
|Afghanistan|  Baraki Barak|     0|
|    Albania|  Bajram Curri|     4|
|    Albania|        Tirana|     3|
|    Albania|          Fier|     1|
|    Algeria|     Lakhdaria|    71|
|    Algeria|      Boufarik|    66|
|    Algeria|     Birkhadem|    44|
|    Algeria|Khemis Miliana|    42|
|    Algeria|        Dellys|    36|
|    Algeria|        Baraki|    25|
|    Algeria|       Naciria|    24|
+-----------+--------------+------+
only showing top 20 rows



## Finding Top 5 city on behalf of total damage

In [None]:
from pyspark.sql.functions import col

countries = [row['COUNTRY'] for row in ndf.select('COUNTRY').distinct().collect()]
for country in countries:
    n3df = ndf.filter(col('COUNTRY')==country).limit(5)
    n3df = n3df.orderBy(col('COUNTRY'))
    n3df.show() 

+----------+-------+--------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|      DATE|COUNTRY|               STATE|            CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION| CATEGORY|DAMAGE|
+----------+-------+--------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|2023-04-02| Russia|                null|Saint Petersburg|       Darya Trepova|     42|   1|During a speech b...|Explosion|    43|
|2018-02-18| Russia|Republic of Dagestan|         Kizlyar|       Islamic State|      5|   5|Five people were ...|  Assault|    10|
|2017-04-03| Russia|                null|Saint Petersburg|Imam Shamil Batta...|     64|  15|A suicide bombing...|Explosion|    79|
|2016-08-17| Russia|              Moscow|            null|Islamic State of ...|      1|   1|Two men with fire...|     null|     2|
|2009-09-02| Russia|Republic of Dagestan|     Makhachkala|                null|    

## Creating new column YEAR

In [None]:
from pyspark.sql.functions import year

year = ndf.withColumn('YEAR',year(ndf.DATE))
year.show(5)

+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+----+
|      DATE| COUNTRY|             STATE|            CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION| CATEGORY|DAMAGE|YEAR|
+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+----+
|2023-04-02|  Russia|              null|Saint Petersburg|       Darya Trepova|     42|   1|During a speech b...|Explosion|    43|2023|
|2023-03-06|Pakistan|             Sindh|         Karachi|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|Explosion|    22|2023|
|2023-02-17|Pakistan|             Sindh|         Karachi|Tehrik-i-Taliban ...|     16|   5|The 2023 Karachi ...|  Assault|    21|2023|
|2023-01-30|Pakistan|Khyber Pakhtunkhwa|        Peshawar|     Jamaat-ul-Ahrar|    220| 101|A suicide attacke...|Explosion|   321|2023|
|2023-01-27|  Israel|Jerusalem District|       Jerusale

## Yearwise most attack

In [None]:
from pyspark.sql import functions as f
ydf = year.groupBy('YEAR').agg(
    f.sum('DAMAGE').alias('Total_damage')
)
ydf = ydf.orderBy(ydf['Total_damage'].desc())
ydf.show()

+----+------------+
|YEAR|Total_damage|
+----+------------+
|2007|       24078|
|2006|       19208|
|2008|       19087|
|2005|       18025|
|2004|       14075|
|2001|       12402|
|2016|       12400|
|1998|       10533|
|2002|        9259|
|2003|        8687|
|2017|        8679|
|1995|        8544|
|2013|        6865|
|1996|        5433|
|2012|        4729|
|2019|        4696|
|2018|        4530|
|1993|        4330|
|1999|        3774|
|1988|        3650|
+----+------------+
only showing top 20 rows



## in which year,which country attacked most

In [None]:
cydf = year.groupBy('COUNTRY','YEAR').agg(
    f.count('DAMAGE').alias('Country_year_attack')
)
cydf = cydf.orderBy(cydf['COUNTRY'],cydf['Country_year_attack'].desc())
cydf.show()

+-----------+----+-------------------+
|    COUNTRY|YEAR|Country_year_attack|
+-----------+----+-------------------+
|Afghanistan|2008|                343|
|Afghanistan|2006|                279|
|Afghanistan|2007|                247|
|Afghanistan|2009|                240|
|Afghanistan|2005|                170|
|Afghanistan|2003|                121|
|Afghanistan|2004|                115|
|Afghanistan|2002|                 58|
|Afghanistan|2017|                 13|
|Afghanistan|2016|                 12|
|Afghanistan|2018|                 11|
|Afghanistan|2020|                  7|
|Afghanistan|2019|                  7|
|Afghanistan|1988|                  5|
|Afghanistan|2022|                  4|
|Afghanistan|1993|                  4|
|Afghanistan|1998|                  4|
|Afghanistan|1979|                  4|
|Afghanistan|2001|                  3|
|Afghanistan|1996|                  3|
+-----------+----+-------------------+
only showing top 20 rows



## Analysis on Pakistan

In [None]:
pak_df = ndf.filter(ndf['COUNTRY']=='Pakistan')
pak_df.show(5) 

+----------+--------+------------------+--------+--------------------+-------+----+--------------------+-----------------+------+
|      DATE| COUNTRY|             STATE|    CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         CATEGORY|DAMAGE|
+----------+--------+------------------+--------+--------------------+-------+----+--------------------+-----------------+------+
|2023-03-06|Pakistan|             Sindh| Karachi|Tehreek-e-Jihad P...|     13|   9|A terrorist attac...|        Explosion|    22|
|2023-02-17|Pakistan|             Sindh| Karachi|Tehrik-i-Taliban ...|     16|   5|The 2023 Karachi ...|          Assault|    21|
|2023-01-30|Pakistan|Khyber Pakhtunkhwa|Peshawar|     Jamaat-ul-Ahrar|    220| 101|A suicide attacke...|        Explosion|   321|
|2022-04-26|Pakistan|             Sindh| Karachi|Baloch Liberation...|      4|   4|A female suicide ...|        Explosion|     8|
|2022-03-04|Pakistan|Khyber Pakhtunkhwa|Peshawar|Islamic State – K...|    196|  63|A terro

In [None]:
pak_df1 = pak_df.groupBy('PERPETRATOR').count().orderBy('count',ascending=False).limit(10)
display(pak_df1)

PERPETRATOR,count
,879
Taliban,59
Baloch Liberation Army,47
Lashkar-e-Jhangvi,15
Islamic State,11
Jamaat-ul-Ahrar,8
Research and Analysis Wing (RAW),3
Amal,3
Harkat ul-Mujahedin,3
Mutthaida Qami Movement (MQM),3


Databricks visualization. Run in Databricks to view.

In [None]:
pak_df2 = pak_df.groupBy('CATEGORY').count().orderBy('count',ascending=False)
display(pak_df2)

CATEGORY,count
Explosion,689
,173
Assault,128
"Assault,Explosion",33
Hostage,22
"Assault,Hostage",12
"Explosion,Hostage",3
"Explosion,Accident",1
"Assault,Explosion,Accident",1


Databricks visualization. Run in Databricks to view.

## In pakistan,which city attacked most

In [None]:
pak_state = pak_df.groupBy('STATE', 'CITY').agg({'DAMAGE': 'count'}).orderBy('count(DAMAGE)', ascending=False)
pak_state.show()

+--------------------+----------------+-------------+
|               STATE|            CITY|count(DAMAGE)|
+--------------------+----------------+-------------+
|                null|            null|          335|
|               Sindh|         Karachi|          144|
|         Balochistan|          Quetta|          110|
|  Khyber Pakhtunkhwa|        Peshawar|           89|
|              Punjab|          Lahore|           45|
|Islamabad Capital...|       Islamabad|           43|
|              Punjab|      Rawalpindi|           23|
|         Balochistan|      Dera Bugti|           22|
|         Balochistan|            null|           17|
|Federally Adminis...|            Wana|           16|
|         Balochistan|           Kohlu|           16|
|         Balochistan|         Mastung|           15|
|               Sindh|       Hyderabad|           14|
|  Khyber Pakhtunkhwa|           Bannu|           13|
|  Khyber Pakhtunkhwa|Dera Ismail Khan|           11|
|  Khyber Pakhtunkhwa|      

In [None]:
coun_df = ndf.groupBy('COUNTRY','CATEGORY').agg({'CATEGORY':'count'}).orderBy('count(CATEGORY)',ascending=False).limit(10)
display(coun_df)


COUNTRY,CATEGORY,count(CATEGORY)
Iraq,Explosion,2252
Iraq,Assault,1165
France,Explosion,723
Afghanistan,Explosion,693
Pakistan,Explosion,689
Turkey,Explosion,652
Spain,Explosion,625
United Kingdom,Explosion,589
India,Explosion,589
Colombia,Explosion,573


Databricks visualization. Run in Databricks to view.

## lowering 'Description' Column

In [None]:
from pyspark.sql.functions import lower,upper
ddf = ndf.withColumn('DESCRIPTION',lower(col('DESCRIPTION')))
ddf.show(5)

+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|      DATE| COUNTRY|             STATE|            CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION| CATEGORY|DAMAGE|
+----------+--------+------------------+----------------+--------------------+-------+----+--------------------+---------+------+
|2023-04-02|  Russia|              null|Saint Petersburg|       Darya Trepova|     42|   1|during a speech b...|Explosion|    43|
|2023-03-06|Pakistan|             Sindh|         Karachi|Tehreek-e-Jihad P...|     13|   9|a terrorist attac...|Explosion|    22|
|2023-02-17|Pakistan|             Sindh|         Karachi|Tehrik-i-Taliban ...|     16|   5|the 2023 karachi ...|  Assault|    21|
|2023-01-30|Pakistan|Khyber Pakhtunkhwa|        Peshawar|     Jamaat-ul-Ahrar|    220| 101|a suicide attacke...|Explosion|   321|
|2023-01-27|  Israel|Jerusalem District|       Jerusalem|Palestinian Islam...|      3|   7

## Finding suicide wise attack in india

In [None]:
ddf1 = ddf.filter((ddf['DESCRIPTION'].contains('suicide')) & (df['COUNTRY']=='India'))
ddf1.show()

+----------+-------+-----------------+--------+--------------------+-------+----+--------------------+-----------------+------+
|      DATE|COUNTRY|            STATE|    CITY|         PERPETRATOR|INJURED|DEAD|         DESCRIPTION|         CATEGORY|DAMAGE|
+----------+-------+-----------------+--------+--------------------+-------+----+--------------------+-----------------+------+
|2019-02-14|  India|Jammu and Kashmir| Pulwama|Jaish-e-Mohammad ...|     35|  40|40 personnel belo...|        Explosion|    75|
|2006-05-23|  India|             null|    null|Hizbul Mujahideen...|     25|   1|in hyderpora, a s...|        Explosion|    26|
|2005-11-14|  India|Jammu and Kashmir|Srinagar|        Al Mansooran|     17|   4|an attack at a la...|Assault,Explosion|    21|
|2005-11-02|  India|Jammu and Kashmir|Srinagar|Jaish-e-Mohammad ...|     18|  10|on the day when t...|        Explosion|    28|
|2004-07-24|  India|Jammu and Kashmir|Srinagar|        Al Mansooran|      2|   5|a suicide bomber ...|As